Commit ·
b5abced
1
Parent(s): da7a18e
Changed OpenAI CoT streaming handling; added round-robin mode for credentials; various refactoring.
Browse files- app/express_key_manager.py +97 -0
- app/openai_handler.py +284 -0
app/express_key_manager.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from typing import List, Optional, Tuple
|
| 3 |
+
import config as app_config
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ExpressKeyManager:
    """
    Manager for Vertex Express API keys with support for both random and round-robin selection strategies.
    Similar to CredentialManager but specifically for Express API keys.
    """

    def __init__(self):
        """Initialize the Express Key Manager with API keys from config."""
        # Keys are loaded from the application config at construction time.
        self.express_keys: List[str] = app_config.VERTEX_EXPRESS_API_KEY_VAL
        # Cursor for round-robin selection; kept within [0, len(express_keys)).
        self.round_robin_index: int = 0

    def get_total_keys(self) -> int:
        """Get the total number of available Express API keys."""
        return len(self.express_keys)

    def _get_key_with_index(self, key: str, index: int) -> Tuple[str, int]:
        """Return a tuple of (key, original_index) for logging purposes."""
        return (key, index)

    def get_random_express_key(self) -> Optional[Tuple[str, int]]:
        """
        Get a random Express API key.

        Returns:
            (key, original_index) tuple, or None if no keys are available.
        """
        if not self.express_keys:
            print("WARNING: No Express API keys available for selection.")
            return None

        print("DEBUG: Using random Express API key selection strategy.")

        # Pick a single uniformly random index directly. This is O(1) and has
        # the same distribution as the previous implementation, which built
        # list(enumerate(keys)), shuffled the whole list, and took element 0.
        original_idx = random.randrange(len(self.express_keys))
        key = self.express_keys[original_idx]
        return self._get_key_with_index(key, original_idx)

    def get_roundrobin_express_key(self) -> Optional[Tuple[str, int]]:
        """
        Get an Express API key using round-robin selection.

        Returns:
            (key, original_index) tuple, or None if no keys are available.
        """
        if not self.express_keys:
            print("WARNING: No Express API keys available for selection.")
            return None

        print("DEBUG: Using round-robin Express API key selection strategy.")

        # Clamp the cursor in case the key list shrank since the last call.
        if self.round_robin_index >= len(self.express_keys):
            self.round_robin_index = 0

        # Select the key at the current cursor position.
        original_idx = self.round_robin_index
        key = self.express_keys[original_idx]

        # Advance the cursor for the next call, wrapping at the end.
        self.round_robin_index = (self.round_robin_index + 1) % len(self.express_keys)

        return self._get_key_with_index(key, original_idx)

    def get_express_api_key(self) -> Optional[Tuple[str, int]]:
        """
        Get an Express API key based on the configured selection strategy.

        Checks the ROUNDROBIN config flag and delegates to the appropriate
        strategy method.

        Returns:
            (key, original_index) tuple, or None if no keys are available.
        """
        if app_config.ROUNDROBIN:
            return self.get_roundrobin_express_key()
        return self.get_random_express_key()

    def get_all_keys_indexed(self) -> List[Tuple[int, str]]:
        """
        Get all Express API keys with their indices.

        Useful for retry logic where we need to try all keys.

        Returns:
            List of (original_index, key) tuples.
        """
        return list(enumerate(self.express_keys))

    def refresh_keys(self) -> None:
        """
        Refresh the Express API keys from config.

        This allows for dynamic updates if the config is reloaded.
        """
        self.express_keys = app_config.VERTEX_EXPRESS_API_KEY_VAL
        # Reset the round-robin cursor if it now points past the end of the
        # (possibly shorter) refreshed key list.
        if self.round_robin_index >= len(self.express_keys):
            self.round_robin_index = 0
        print(f"INFO: Express API keys refreshed. Total keys: {self.get_total_keys()}")
|
app/openai_handler.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenAI handler module for creating clients and processing OpenAI Direct mode responses.
|
| 3 |
+
This module encapsulates all OpenAI-specific logic that was previously in chat_api.py.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
import asyncio
|
| 8 |
+
from typing import Dict, Any, AsyncGenerator
|
| 9 |
+
|
| 10 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 11 |
+
import openai
|
| 12 |
+
from google.auth.transport.requests import Request as AuthRequest
|
| 13 |
+
|
| 14 |
+
from models import OpenAIRequest
|
| 15 |
+
from config import VERTEX_REASONING_TAG
|
| 16 |
+
import config as app_config
|
| 17 |
+
from api_helpers import (
|
| 18 |
+
create_openai_error_response,
|
| 19 |
+
openai_fake_stream_generator,
|
| 20 |
+
StreamingReasoningProcessor
|
| 21 |
+
)
|
| 22 |
+
from message_processing import extract_reasoning_by_tags
|
| 23 |
+
from credentials_manager import _refresh_auth
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class OpenAIDirectHandler:
    """Handles OpenAI Direct mode operations including client creation and response processing.

    Wraps a Vertex AI OpenAI-compatible endpoint behind the OpenAI Python
    client, injecting Google safety settings and a reasoning-tag marker, and
    post-processes responses to split tagged reasoning out of the content.
    """

    def __init__(self, credential_manager):
        # Provider of rotated GCP credentials; must expose get_credentials()
        # returning (credentials, project_id) — see process_request below.
        self.credential_manager = credential_manager
        # All harm categories are explicitly set to "OFF", i.e. safety
        # filtering is disabled for this direct path.
        self.safety_settings = [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "OFF"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "OFF"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "OFF"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "OFF"},
            {"category": 'HARM_CATEGORY_CIVIC_INTEGRITY', "threshold": 'OFF'}
        ]

    def create_openai_client(self, project_id: str, gcp_token: str, location: str = "global") -> openai.AsyncOpenAI:
        """Create an OpenAI client configured for Vertex AI endpoint."""
        # Vertex exposes an OpenAI-compatible surface under this endpoint;
        # the OAuth bearer token is passed where the client expects an API key.
        endpoint_url = (
            f"https://aiplatform.googleapis.com/v1beta1/"
            f"projects/{project_id}/locations/{location}/endpoints/openapi"
        )

        return openai.AsyncOpenAI(
            base_url=endpoint_url,
            api_key=gcp_token,  # OAuth token
        )

    def prepare_openai_params(self, request: OpenAIRequest, model_id: str) -> Dict[str, Any]:
        """Prepare parameters for OpenAI API call.

        Builds the kwargs for chat.completions.create from the incoming
        request, dropping any fields the caller left unset (None). Note this
        also drops stream=False / n=None etc.; the stream flag is re-asserted
        explicitly in the streaming/non-streaming handlers.
        """
        params = {
            "model": model_id,
            "messages": [msg.model_dump(exclude_unset=True) for msg in request.messages],
            "temperature": request.temperature,
            "max_tokens": request.max_tokens,
            "top_p": request.top_p,
            "stream": request.stream,
            "stop": request.stop,
            "seed": request.seed,
            "n": request.n,
        }
        # Remove None values
        return {k: v for k, v in params.items() if v is not None}

    def prepare_extra_body(self) -> Dict[str, Any]:
        """Prepare extra body parameters for OpenAI API call.

        The 'google' vendor extension carries the disabled safety settings and
        the marker tag the model uses to delimit its reasoning ("thoughts"),
        which the streaming/non-streaming handlers later strip back out.
        """
        return {
            "extra_body": {
                'google': {
                    'safety_settings': self.safety_settings,
                    'thought_tag_marker': VERTEX_REASONING_TAG
                }
            }
        }

    async def handle_streaming_response(
        self,
        openai_client: openai.AsyncOpenAI,
        openai_params: Dict[str, Any],
        openai_extra_body: Dict[str, Any],
        request: OpenAIRequest
    ) -> StreamingResponse:
        """Handle streaming responses for OpenAI Direct mode.

        Chooses between fake streaming (single upstream call replayed as SSE,
        when FAKE_STREAMING_ENABLED) and true token-by-token streaming.
        """
        if app_config.FAKE_STREAMING_ENABLED:
            print(f"INFO: OpenAI Fake Streaming (SSE Simulation) ENABLED for model '{request.model}'.")
            return StreamingResponse(
                openai_fake_stream_generator(
                    openai_client=openai_client,
                    openai_params=openai_params,
                    openai_extra_body=openai_extra_body,
                    request_obj=request,
                    is_auto_attempt=False
                ),
                media_type="text/event-stream"
            )
        else:
            print(f"INFO: OpenAI True Streaming ENABLED for model '{request.model}'.")
            return StreamingResponse(
                self._true_stream_generator(openai_client, openai_params, openai_extra_body, request),
                media_type="text/event-stream"
            )

    async def _true_stream_generator(
        self,
        openai_client: openai.AsyncOpenAI,
        openai_params: Dict[str, Any],
        openai_extra_body: Dict[str, Any],
        request: OpenAIRequest
    ) -> AsyncGenerator[str, None]:
        """Generate true streaming response.

        Yields SSE-formatted chunks. Each upstream delta is passed through
        StreamingReasoningProcessor, which splits tag-delimited reasoning
        (VERTEX_REASONING_TAG) from regular content across chunk boundaries;
        reasoning is re-emitted under delta['reasoning_content']. Errors are
        surfaced as an OpenAI-style error event followed by [DONE].
        """
        try:
            # Ensure stream=True is explicitly passed for real streaming
            openai_params_for_stream = {**openai_params, "stream": True}
            stream_response = await openai_client.chat.completions.create(
                **openai_params_for_stream,
                extra_body=openai_extra_body['extra_body']
            )

            # Create processor for tag-based extraction across chunks
            reasoning_processor = StreamingReasoningProcessor(VERTEX_REASONING_TAG)

            async for chunk in stream_response:
                try:
                    chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)

                    choices = chunk_as_dict.get('choices')
                    if choices and isinstance(choices, list) and len(choices) > 0:
                        delta = choices[0].get('delta')
                        if delta and isinstance(delta, dict):
                            # Always remove extra_content if present
                            if 'extra_content' in delta:
                                del delta['extra_content']

                            content = delta.get('content', '')
                            if content:
                                # Use the processor to extract reasoning
                                processed_content, current_reasoning = reasoning_processor.process_chunk(content)

                                # Update delta with processed content
                                if current_reasoning:
                                    delta['reasoning_content'] = current_reasoning
                                if processed_content:
                                    delta['content'] = processed_content
                                elif 'content' in delta:
                                    # Whole chunk was reasoning (or buffered):
                                    # drop the now-empty content field.
                                    del delta['content']

                    yield f"data: {json.dumps(chunk_as_dict)}\n\n"

                except Exception as chunk_error:
                    # Per-chunk failure: emit an error event and terminate the
                    # stream cleanly rather than raising into the transport.
                    error_msg = f"Error processing OpenAI chunk for {request.model}: {str(chunk_error)}"
                    print(f"ERROR: {error_msg}")
                    if len(error_msg) > 1024:
                        error_msg = error_msg[:1024] + "..."
                    error_response = create_openai_error_response(500, error_msg, "server_error")
                    yield f"data: {json.dumps(error_response)}\n\n"
                    yield "data: [DONE]\n\n"
                    return

            # Handle any remaining buffer content
            if reasoning_processor.tag_buffer and not reasoning_processor.inside_tag:
                # Output any remaining content
                final_chunk = {
                    "id": f"chatcmpl-{int(time.time())}",
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": request.model,
                    "choices": [{"index": 0, "delta": {"content": reasoning_processor.tag_buffer}, "finish_reason": None}]
                }
                yield f"data: {json.dumps(final_chunk)}\n\n"
            elif reasoning_processor.inside_tag and reasoning_processor.reasoning_buffer:
                # We were inside a tag but never found the closing tag
                # NOTE(review): the partial reasoning is logged but not
                # emitted to the client — appears intentional; confirm.
                print(f"WARNING: Unclosed reasoning tag detected. Partial reasoning: {reasoning_processor.reasoning_buffer[:100]}...")

            yield "data: [DONE]\n\n"

        except Exception as stream_error:
            # Failure before/at stream creation or outside the chunk loop.
            error_msg = str(stream_error)
            if len(error_msg) > 1024:
                error_msg = error_msg[:1024] + "..."
            error_msg_full = f"Error during OpenAI streaming for {request.model}: {error_msg}"
            print(f"ERROR: {error_msg_full}")
            error_response = create_openai_error_response(500, error_msg_full, "server_error")
            yield f"data: {json.dumps(error_response)}\n\n"
            yield "data: [DONE]\n\n"

    async def handle_non_streaming_response(
        self,
        openai_client: openai.AsyncOpenAI,
        openai_params: Dict[str, Any],
        openai_extra_body: Dict[str, Any],
        request: OpenAIRequest
    ) -> JSONResponse:
        """Handle non-streaming responses for OpenAI Direct mode.

        Makes a single completion call, then splits tag-delimited reasoning
        out of the message content into message['reasoning_content'].
        Reasoning post-processing errors are logged but do not fail the
        request; upstream call errors return a 500 JSON error body.
        """
        try:
            # Ensure stream=False is explicitly passed
            openai_params_non_stream = {**openai_params, "stream": False}
            response = await openai_client.chat.completions.create(
                **openai_params_non_stream,
                extra_body=openai_extra_body['extra_body']
            )
            response_dict = response.model_dump(exclude_unset=True, exclude_none=True)

            try:
                choices = response_dict.get('choices')
                if choices and isinstance(choices, list) and len(choices) > 0:
                    message_dict = choices[0].get('message')
                    if message_dict and isinstance(message_dict, dict):
                        # Always remove extra_content from the message if it exists
                        if 'extra_content' in message_dict:
                            del message_dict['extra_content']

                        # Extract reasoning from content
                        full_content = message_dict.get('content')
                        actual_content = full_content if isinstance(full_content, str) else ""

                        if actual_content:
                            print(f"INFO: OpenAI Direct Non-Streaming - Applying tag extraction with fixed marker: '{VERTEX_REASONING_TAG}'")
                            reasoning_text, actual_content = extract_reasoning_by_tags(actual_content, VERTEX_REASONING_TAG)
                            message_dict['content'] = actual_content
                            if reasoning_text:
                                message_dict['reasoning_content'] = reasoning_text
                                print(f"DEBUG: Tag extraction success. Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content)}")
                            else:
                                print(f"DEBUG: No content found within fixed tag '{VERTEX_REASONING_TAG}'.")
                        else:
                            print(f"WARNING: OpenAI Direct Non-Streaming - No initial content found in message.")
                            message_dict['content'] = ""

            except Exception as e_reasoning:
                # Best-effort post-processing: fall through and return the
                # (possibly unmodified) upstream response.
                print(f"WARNING: Error during non-streaming reasoning processing for model {request.model}: {e_reasoning}")

            return JSONResponse(content=response_dict)

        except Exception as e:
            error_msg = f"Error calling OpenAI client for {request.model}: {str(e)}"
            print(f"ERROR: {error_msg}")
            return JSONResponse(
                status_code=500,
                content=create_openai_error_response(500, error_msg, "server_error")
            )

    async def process_request(self, request: OpenAIRequest, base_model_name: str):
        """Main entry point for processing OpenAI Direct mode requests.

        Acquires rotated GCP credentials, refreshes an OAuth token, builds
        the Vertex-backed OpenAI client, and dispatches to the streaming or
        non-streaming handler based on request.stream. Returns a
        StreamingResponse or JSONResponse (500 JSON error on credential or
        token failure).
        """
        print(f"INFO: Using OpenAI Direct Path for model: {request.model}")

        # Get credentials
        rotated_credentials, rotated_project_id = self.credential_manager.get_credentials()

        if not rotated_credentials or not rotated_project_id:
            error_msg = "OpenAI Direct Mode requires GCP credentials, but none were available or loaded successfully."
            print(f"ERROR: {error_msg}")
            return JSONResponse(
                status_code=500,
                content=create_openai_error_response(500, error_msg, "server_error")
            )

        print(f"INFO: [OpenAI Direct Path] Using credentials for project: {rotated_project_id}")
        gcp_token = _refresh_auth(rotated_credentials)

        if not gcp_token:
            error_msg = f"Failed to obtain valid GCP token for OpenAI client (Project: {rotated_project_id})."
            print(f"ERROR: {error_msg}")
            return JSONResponse(
                status_code=500,
                content=create_openai_error_response(500, error_msg, "server_error")
            )

        # Create client and prepare parameters
        openai_client = self.create_openai_client(rotated_project_id, gcp_token)
        # Vertex's OpenAI-compatible endpoint namespaces Google models.
        model_id = f"google/{base_model_name}"
        openai_params = self.prepare_openai_params(request, model_id)
        openai_extra_body = self.prepare_extra_body()

        # Handle streaming vs non-streaming
        if request.stream:
            return await self.handle_streaming_response(
                openai_client, openai_params, openai_extra_body, request
            )
        else:
            return await self.handle_non_streaming_response(
                openai_client, openai_params, openai_extra_body, request
            )
|