Lưu Quang Vũ Nativu5 committed on
Commit
77830e0
unverified
1 Parent(s): fe5ae62

:sparkles: Add tool call support and fix image input and output (#47)

Browse files

* Add tool_calls support

* Add tool_calls support

* Add tool_calls support

* Add tool_calls support

* Add tool_calls support

* Add endpoint generate and edit images

* Add endpoint generate and edit images

* Add support structured output

* Add support structured output

* Incorrect logging

* Force LLM to follow tool_call format

* Format the code using Black.

* Fixes "Error handling message: No image returned"

* Return image dimensions

* Fixes Pylance warning

* Format by ruff

* uv run directly

* Fixes XML_WRAP_HINT leaked

* Instruct an LLM to return code snippets enclosed within Markdown fenced code blocks.

* Adjust the streaming response logic to ensure important sections remain intact and are not fragmented.

* Change chunk_size to 64

* ruff check

* Fixes for im_start/im_end hints leaking from responses.

* Ensure all endpoints are fully compliant with OpenAI compatibility standards.

* :memo: Fix doc

---------

Co-authored-by: Nativu5 <44155313+Nativu5@users.noreply.github.com>

app/models/models.py CHANGED
@@ -1,5 +1,7 @@
 
 
1
  from datetime import datetime
2
- from typing import Dict, List, Literal, Optional, Union
3
 
4
  from pydantic import BaseModel, Field
5
 
@@ -17,8 +19,9 @@ class Message(BaseModel):
17
  """Message model"""
18
 
19
  role: str
20
- content: Union[str, List[ContentItem]]
21
  name: Optional[str] = None
 
22
 
23
 
24
  class Choice(BaseModel):
@@ -29,6 +32,49 @@ class Choice(BaseModel):
29
  finish_reason: str
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  class Usage(BaseModel):
33
  """Usage statistics model"""
34
 
@@ -51,14 +97,16 @@ class ChatCompletionRequest(BaseModel):
51
 
52
  model: str
53
  messages: List[Message]
 
 
54
  temperature: Optional[float] = 0.7
55
  top_p: Optional[float] = 1.0
56
- n: Optional[int] = 1
57
- stream: Optional[bool] = False
58
  max_tokens: Optional[int] = None
59
- presence_penalty: Optional[float] = 0
60
- frequency_penalty: Optional[float] = 0
61
- user: Optional[str] = None
 
 
62
 
63
 
64
  class ChatCompletionResponse(BaseModel):
@@ -101,3 +149,130 @@ class ConversationInStore(BaseModel):
101
  ..., description="Metadata for Gemini API to locate the conversation"
102
  )
103
  messages: list[Message] = Field(..., description="Message contents in the conversation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
  from datetime import datetime
4
+ from typing import Any, Dict, List, Literal, Optional, Union
5
 
6
  from pydantic import BaseModel, Field
7
 
 
19
  """Message model"""
20
 
21
  role: str
22
+ content: Union[str, List[ContentItem], None] = None
23
  name: Optional[str] = None
24
+ tool_calls: Optional[List["ToolCall"]] = None
25
 
26
 
27
  class Choice(BaseModel):
 
32
  finish_reason: str
33
 
34
 
35
class FunctionCall(BaseModel):
    """Function call payload"""

    name: str  # Name of the function the model wants to invoke.
    arguments: str  # JSON-encoded argument string, kept as text per the OpenAI wire format.


class ToolCall(BaseModel):
    """Tool call item"""

    id: str  # Unique call identifier.
    type: Literal["function"]  # Only function-type tool calls are supported.
    function: FunctionCall  # The function name + arguments being invoked.


class ToolFunctionDefinition(BaseModel):
    """Function definition for tool."""

    name: str
    description: Optional[str] = None  # Human-readable purpose shown to the model.
    parameters: Optional[Dict[str, Any]] = None  # JSON Schema describing the arguments.


class Tool(BaseModel):
    """Tool specification."""

    type: Literal["function"]
    function: ToolFunctionDefinition


class ToolChoiceFunctionDetail(BaseModel):
    """Detail of a tool choice function."""

    name: str  # Name of the function the client forces the model to call.


class ToolChoiceFunction(BaseModel):
    """Tool choice forcing a specific function."""

    type: Literal["function"]
    function: ToolChoiceFunctionDetail
76
+
77
+
78
  class Usage(BaseModel):
79
  """Usage statistics model"""
80
 
 
97
 
98
  model: str
99
  messages: List[Message]
100
+ stream: Optional[bool] = False
101
+ user: Optional[str] = None
102
  temperature: Optional[float] = 0.7
103
  top_p: Optional[float] = 1.0
 
 
104
  max_tokens: Optional[int] = None
105
+ tools: Optional[List["Tool"]] = None
106
+ tool_choice: Optional[
107
+ Union[Literal["none"], Literal["auto"], Literal["required"], "ToolChoiceFunction"]
108
+ ] = None
109
+ response_format: Optional[Dict[str, Any]] = None
110
 
111
 
112
  class ChatCompletionResponse(BaseModel):
 
149
  ..., description="Metadata for Gemini API to locate the conversation"
150
  )
151
  messages: list[Message] = Field(..., description="Message contents in the conversation")
152
+
153
+
154
class ResponseInputContent(BaseModel):
    """Content item for Responses API input."""

    type: Literal["input_text", "input_image"]
    text: Optional[str] = None  # Present for "input_text" parts.
    image_url: Optional[str] = None  # http(s) or data: URL for "input_image" parts.
    image_base64: Optional[str] = None  # Raw base64 alternative to image_url.
    mime_type: Optional[str] = None  # MIME type used when synthesizing a data: URL.


class ResponseInputItem(BaseModel):
    """Single input item for Responses API."""

    type: Optional[Literal["message"]] = "message"
    role: Literal["user", "assistant", "system", "developer"]
    content: Union[str, List[ResponseInputContent]]


class ResponseToolChoice(BaseModel):
    """Tool choice enforcing a specific tool in Responses API."""

    type: Literal["image_generation"]


class ResponseImageTool(BaseModel):
    """Image generation tool specification for Responses API."""

    type: Literal["image_generation"]
    model: Optional[str] = None  # Optional override of the image model.
    output_format: Optional[str] = None  # Desired output encoding — presumably "png"/"jpeg"; verify against producer.
184
+
185
+
186
class ResponseCreateRequest(BaseModel):
    """Responses API request payload."""

    model: str
    input: Union[str, List[ResponseInputItem]]  # Plain prompt string or structured items.
    instructions: Optional[Union[str, List[ResponseInputItem]]] = None  # System/developer guidance.
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    max_output_tokens: Optional[int] = None
    stream: Optional[bool] = False
    tool_choice: Optional[ResponseToolChoice] = None  # Only "image_generation" is modeled here.
    tools: Optional[List[ResponseImageTool]] = None
    store: Optional[bool] = None
    user: Optional[str] = None
    response_format: Optional[Dict[str, Any]] = None  # OpenAI-style structured-output spec.
    metadata: Optional[Dict[str, Any]] = None
202
+
203
+
204
class ResponseUsage(BaseModel):
    """Usage statistics for Responses API."""

    input_tokens: int
    output_tokens: int
    total_tokens: int


class ResponseOutputContent(BaseModel):
    """Content item for Responses API output."""

    type: Literal["output_text", "output_image"]
    text: Optional[str] = None  # Present for "output_text".
    image_base64: Optional[str] = None  # Present for "output_image".
    mime_type: Optional[str] = None
    width: Optional[int] = None  # Image dimensions, when known.
    height: Optional[int] = None


class ResponseOutputMessage(BaseModel):
    """Assistant message returned by Responses API."""

    id: str
    type: Literal["message"]
    role: Literal["assistant"]
    content: List[ResponseOutputContent]
230
+
231
+
232
class ResponseImageGenerationCall(BaseModel):
    """Image generation call record emitted in Responses API."""

    id: str
    type: Literal["image_generation_call"] = "image_generation_call"
    status: Literal["completed", "in_progress", "generating", "failed"] = "completed"
    result: Optional[str] = None  # presumably base64 image data — verify against producer
    output_format: Optional[str] = None
    size: Optional[str] = None  # presumably "WxH" string — verify against producer
    revised_prompt: Optional[str] = None


class ResponseToolCall(BaseModel):
    """Tool call record emitted in Responses API."""

    id: str
    type: Literal["tool_call"] = "tool_call"
    status: Literal["in_progress", "completed", "failed", "requires_action"] = "completed"
    function: FunctionCall
251
+
252
+
253
class ResponseCreateResponse(BaseModel):
    """Responses API response payload."""

    id: str
    object: Literal["response"] = "response"
    created: int  # Unix timestamp, seconds.
    model: str
    output: List[Union[ResponseOutputMessage, ResponseImageGenerationCall, ResponseToolCall]]
    output_text: Optional[str] = None  # NOTE(review): looks like a flattened text convenience field — confirm how it is filled
    status: Literal[
        "in_progress",
        "completed",
        "failed",
        "incomplete",
        "requires_action",
    ] = "completed"
    usage: ResponseUsage
    metadata: Optional[Dict[str, Any]] = None
    system_fingerprint: Optional[str] = None
    input: Optional[Union[str, List[ResponseInputItem]]] = None  # Echo of the normalized request input.


# Rebuild models with forward references
Message.model_rebuild()
ToolCall.model_rebuild()
ChatCompletionRequest.model_rebuild()
app/server/chat.py CHANGED
@@ -1,26 +1,44 @@
 
 
 
 
1
  import uuid
 
2
  from datetime import datetime, timezone
3
  from pathlib import Path
 
4
 
5
  import orjson
6
  from fastapi import APIRouter, Depends, HTTPException, status
7
  from fastapi.responses import StreamingResponse
8
  from gemini_webapi.client import ChatSession
9
  from gemini_webapi.constants import Model
 
10
  from loguru import logger
11
 
12
  from ..models import (
13
  ChatCompletionRequest,
 
14
  ConversationInStore,
 
15
  Message,
16
  ModelData,
17
  ModelListResponse,
 
 
 
 
 
 
 
 
 
 
 
 
18
  )
19
- from ..services import (
20
- GeminiClientPool,
21
- GeminiClientWrapper,
22
- LMDBConversationStore,
23
- )
24
  from ..utils import g_config
25
  from ..utils.helper import estimate_tokens
26
  from .middleware import get_temp_dir, verify_api_key
@@ -30,10 +48,396 @@ MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9)
30
 
31
  CONTINUATION_HINT = "\n(More messages to come, please reply with just 'ok.')"
32
 
 
 
 
 
 
 
 
 
33
 
34
  router = APIRouter()
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  @router.get("/v1/models", response_model=ModelListResponse)
38
  async def list_models(api_key: str = Depends(verify_api_key)):
39
  now = int(datetime.now(tz=timezone.utc).timestamp())
@@ -71,29 +475,51 @@ async def create_chat_completion(
71
  detail="At least one message is required in the conversation.",
72
  )
73
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  # Check if conversation is reusable
75
  session, client, remaining_messages = _find_reusable_session(db, pool, model, request.messages)
76
 
77
  if session:
78
- # Prepare the model input depending on how many turns are missing.
79
- if len(remaining_messages) == 1:
 
 
 
 
 
 
 
80
  model_input, files = await GeminiClientWrapper.process_message(
81
- remaining_messages[0], tmp_dir, tagged=False
82
  )
83
  else:
84
  model_input, files = await GeminiClientWrapper.process_conversation(
85
- remaining_messages, tmp_dir
86
  )
87
  logger.debug(
88
- f"Reused session {session.metadata} - sending {len(remaining_messages)} new messages."
89
  )
90
  else:
91
  # Start a new session and concat messages into a single string
92
  try:
93
  client = pool.acquire()
94
  session = client.start_chat(model=model)
 
 
 
95
  model_input, files = await GeminiClientWrapper.process_conversation(
96
- request.messages, tmp_dir
97
  )
98
  except ValueError as e:
99
  raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
@@ -114,12 +540,46 @@ async def create_chat_completion(
114
  raise
115
 
116
  # Format the response from API
117
- model_output = GeminiClientWrapper.extract_output(response, include_thoughts=True)
118
- stored_output = GeminiClientWrapper.extract_output(response, include_thoughts=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  # After formatting, persist the conversation to LMDB
121
  try:
122
- last_message = Message(role="assistant", content=stored_output)
 
 
 
 
123
  cleaned_history = db.sanitize_assistant_messages(request.messages)
124
  conv = ConversationInStore(
125
  model=model.model_name,
@@ -138,7 +598,8 @@ async def create_chat_completion(
138
  timestamp = int(datetime.now(tz=timezone.utc).timestamp())
139
  if request.stream:
140
  return _create_streaming_response(
141
- model_output,
 
142
  completion_id,
143
  timestamp,
144
  request.model,
@@ -146,17 +607,277 @@ async def create_chat_completion(
146
  )
147
  else:
148
  return _create_standard_response(
149
- model_output, completion_id, timestamp, request.model, request.messages
 
 
 
 
 
150
  )
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _text_from_message(message: Message) -> str:
154
  """Return text content from a message for token estimation."""
 
155
  if isinstance(message.content, str):
156
- return message.content
157
- return "\n".join(
158
- item.text or "" for item in message.content if getattr(item, "type", "") == "text"
159
- )
 
 
 
 
 
 
 
 
 
160
 
161
 
162
  def _find_reusable_session(
@@ -172,7 +893,7 @@ def _find_reusable_session(
172
  ---------
173
  When a reply was generated by *another* server instance, the local LMDB may
174
  only contain an older part of the conversation. However, as long as we can
175
- line-up **any** earlier assistant/system response, we can restore the
176
  corresponding Gemini session and replay the *remaining* turns locally
177
  (including that missing assistant reply and the subsequent user prompts).
178
 
@@ -248,8 +969,50 @@ async def _send_with_split(session: ChatSession, text: str, files: list[Path | s
248
  return await session.send_message(chunks[-1], files=files)
249
 
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  def _create_streaming_response(
252
  model_output: str,
 
253
  completion_id: str,
254
  created_time: int,
255
  model: str,
@@ -259,8 +1022,10 @@ def _create_streaming_response(
259
 
260
  # Calculate token usage
261
  prompt_tokens = sum(estimate_tokens(_text_from_message(msg)) for msg in messages)
262
- completion_tokens = estimate_tokens(model_output)
 
263
  total_tokens = prompt_tokens + completion_tokens
 
264
 
265
  async def generate_stream():
266
  # Send start event
@@ -274,9 +1039,7 @@ def _create_streaming_response(
274
  yield f"data: {orjson.dumps(data).decode('utf-8')}\n\n"
275
 
276
  # Stream output text in chunks for efficiency
277
- chunk_size = 32
278
- for i in range(0, len(model_output), chunk_size):
279
- chunk = model_output[i : i + chunk_size]
280
  data = {
281
  "id": completion_id,
282
  "object": "chat.completion.chunk",
@@ -286,13 +1049,30 @@ def _create_streaming_response(
286
  }
287
  yield f"data: {orjson.dumps(data).decode('utf-8')}\n\n"
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  # Send end event
290
  data = {
291
  "id": completion_id,
292
  "object": "chat.completion.chunk",
293
  "created": created_time,
294
  "model": model,
295
- "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
296
  "usage": {
297
  "prompt_tokens": prompt_tokens,
298
  "completion_tokens": completion_tokens,
@@ -305,8 +1085,89 @@ def _create_streaming_response(
305
  return StreamingResponse(generate_stream(), media_type="text/event-stream")
306
 
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  def _create_standard_response(
309
  model_output: str,
 
310
  completion_id: str,
311
  created_time: int,
312
  model: str,
@@ -315,8 +1176,14 @@ def _create_standard_response(
315
  """Create standard response"""
316
  # Calculate token usage
317
  prompt_tokens = sum(estimate_tokens(_text_from_message(msg)) for msg in messages)
318
- completion_tokens = estimate_tokens(model_output)
 
319
  total_tokens = prompt_tokens + completion_tokens
 
 
 
 
 
320
 
321
  result = {
322
  "id": completion_id,
@@ -326,8 +1193,8 @@ def _create_standard_response(
326
  "choices": [
327
  {
328
  "index": 0,
329
- "message": {"role": "assistant", "content": model_output},
330
- "finish_reason": "stop",
331
  }
332
  ],
333
  "usage": {
@@ -339,3 +1206,82 @@ def _create_standard_response(
339
 
340
  logger.debug(f"Response created with {total_tokens} total tokens")
341
  return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import re
4
+ import struct
5
  import uuid
6
+ from dataclasses import dataclass
7
  from datetime import datetime, timezone
8
  from pathlib import Path
9
+ from typing import Any, Iterator
10
 
11
  import orjson
12
  from fastapi import APIRouter, Depends, HTTPException, status
13
  from fastapi.responses import StreamingResponse
14
  from gemini_webapi.client import ChatSession
15
  from gemini_webapi.constants import Model
16
+ from gemini_webapi.types.image import GeneratedImage, Image
17
  from loguru import logger
18
 
19
  from ..models import (
20
  ChatCompletionRequest,
21
+ ContentItem,
22
  ConversationInStore,
23
+ FunctionCall,
24
  Message,
25
  ModelData,
26
  ModelListResponse,
27
+ ResponseCreateRequest,
28
+ ResponseCreateResponse,
29
+ ResponseImageGenerationCall,
30
+ ResponseInputContent,
31
+ ResponseInputItem,
32
+ ResponseOutputContent,
33
+ ResponseOutputMessage,
34
+ ResponseToolCall,
35
+ ResponseUsage,
36
+ Tool,
37
+ ToolCall,
38
+ ToolChoiceFunction,
39
  )
40
+ from ..services import GeminiClientPool, GeminiClientWrapper, LMDBConversationStore
41
+ from ..services.client import CODE_BLOCK_HINT, XML_WRAP_HINT
 
 
 
42
  from ..utils import g_config
43
  from ..utils.helper import estimate_tokens
44
  from .middleware import get_temp_dir, verify_api_key
 
48
 
49
  CONTINUATION_HINT = "\n(More messages to come, please reply with just 'ok.')"
50
 
51
+ TOOL_BLOCK_RE = re.compile(r"```xml\s*(.*?)```", re.DOTALL | re.IGNORECASE)
52
+ TOOL_CALL_RE = re.compile(
53
+ r"<tool_call\s+name=\"([^\"]+)\">(.*?)</tool_call>", re.DOTALL | re.IGNORECASE
54
+ )
55
+ JSON_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL | re.IGNORECASE)
56
+ CONTROL_TOKEN_RE = re.compile(r"<\|im_(?:start|end)\|>")
57
+ XML_HINT_STRIPPED = XML_WRAP_HINT.strip()
58
+ CODE_HINT_STRIPPED = CODE_BLOCK_HINT.strip()
59
 
60
  router = APIRouter()
61
 
62
 
63
+ @dataclass
64
+ class StructuredOutputRequirement:
65
+ """Represents a structured response request from the client."""
66
+
67
+ schema_name: str
68
+ schema: dict[str, Any]
69
+ instruction: str
70
+ raw_format: dict[str, Any]
71
+
72
+
73
+ def _build_structured_requirement(
74
+ response_format: dict[str, Any] | None,
75
+ ) -> StructuredOutputRequirement | None:
76
+ """Translate OpenAI-style response_format into internal instructions."""
77
+ if not response_format or not isinstance(response_format, dict):
78
+ return None
79
+
80
+ if response_format.get("type") != "json_schema":
81
+ logger.warning(f"Unsupported response_format type requested: {response_format}")
82
+ return None
83
+
84
+ json_schema = response_format.get("json_schema")
85
+ if not isinstance(json_schema, dict):
86
+ logger.warning(f"Invalid json_schema payload in response_format: {response_format}")
87
+ return None
88
+
89
+ schema = json_schema.get("schema")
90
+ if not isinstance(schema, dict):
91
+ logger.warning(f"Missing `schema` object in response_format payload: {response_format}")
92
+ return None
93
+
94
+ schema_name = json_schema.get("name") or "response"
95
+ strict = json_schema.get("strict", True)
96
+
97
+ pretty_schema = json.dumps(schema, ensure_ascii=False, indent=2, sort_keys=True)
98
+ instruction_parts = [
99
+ "You must respond with a single valid JSON document that conforms to the schema shown below.",
100
+ "Do not include explanations, comments, or any text before or after the JSON.",
101
+ f'Schema name: "{schema_name}"',
102
+ "JSON Schema:",
103
+ pretty_schema,
104
+ ]
105
+ if not strict:
106
+ instruction_parts.insert(
107
+ 1,
108
+ "The schema allows unspecified fields, but include only what is necessary to satisfy the user's request.",
109
+ )
110
+
111
+ instruction = "\n\n".join(instruction_parts)
112
+ return StructuredOutputRequirement(
113
+ schema_name=schema_name,
114
+ schema=schema,
115
+ instruction=instruction,
116
+ raw_format=response_format,
117
+ )
118
+
119
+
120
def _strip_code_fence(text: str) -> str:
    """Remove surrounding ```json fences if present.

    The result is always stripped of outer whitespace, fence or not.
    """
    # JSON_FENCE_RE is anchored (^```...```$), so only a fence wrapping the
    # entire stripped payload is removed — embedded fences are left alone.
    match = JSON_FENCE_RE.match(text.strip())
    if match:
        return match.group(1).strip()
    return text.strip()
126
+
127
+
128
+ def _build_tool_prompt(
129
+ tools: list[Tool],
130
+ tool_choice: str | ToolChoiceFunction | None,
131
+ ) -> str:
132
+ """Generate a system prompt chunk describing available tools."""
133
+ if not tools:
134
+ return ""
135
+
136
+ lines: list[str] = [
137
+ "You can invoke the following developer tools. Call a tool only when it is required and follow the JSON schema exactly when providing arguments."
138
+ ]
139
+
140
+ for tool in tools:
141
+ function = tool.function
142
+ description = function.description or "No description provided."
143
+ lines.append(f"Tool `{function.name}`: {description}")
144
+ if function.parameters:
145
+ schema_text = json.dumps(function.parameters, ensure_ascii=False, indent=2)
146
+ lines.append("Arguments JSON schema:")
147
+ lines.append(schema_text)
148
+ else:
149
+ lines.append("Arguments JSON schema: {}")
150
+
151
+ if tool_choice == "none":
152
+ lines.append(
153
+ "For this request you must not call any tool. Provide the best possible natural language answer."
154
+ )
155
+ elif tool_choice == "required":
156
+ lines.append(
157
+ "You must call at least one tool before responding to the user. Do not provide a final user-facing answer until a tool call has been issued."
158
+ )
159
+ elif isinstance(tool_choice, ToolChoiceFunction):
160
+ target = tool_choice.function.name
161
+ lines.append(
162
+ f"You are required to call the tool named `{target}`. Do not call any other tool."
163
+ )
164
+ # `auto` or None fall back to default instructions.
165
+
166
+ lines.append(
167
+ "When you decide to call a tool you MUST respond with nothing except a single fenced block exactly like the template below."
168
+ )
169
+ lines.append(
170
+ "The fenced block MUST use ```xml as the opening fence and ``` as the closing fence. Do not add text before or after it."
171
+ )
172
+ lines.append("```xml")
173
+ lines.append('<tool_call name="tool_name">{"argument": "value"}</tool_call>')
174
+ lines.append("```")
175
+ lines.append(
176
+ "Use double quotes for JSON keys and values. If you omit the fenced block or include any extra text, the system will assume you are NOT calling a tool and your request will fail."
177
+ )
178
+ lines.append(
179
+ "If multiple tool calls are required, include multiple <tool_call> entries inside the same fenced block. Without a tool call, reply normally and do NOT emit any ```xml fence."
180
+ )
181
+
182
+ return "\n".join(lines)
183
+
184
+
185
def _append_xml_hint_to_last_user_message(messages: list[Message]) -> None:
    """Ensure the last user message carries the XML wrap hint.

    Mutates `messages` in place; at most one message is modified.
    """
    # Walk backwards so only the most recent user turn gets annotated.
    for msg in reversed(messages):
        if msg.role != "user" or msg.content is None:
            continue

        if isinstance(msg.content, str):
            # Plain-text content: append the hint unless it is already present.
            if XML_HINT_STRIPPED not in msg.content:
                msg.content = f"{msg.content}{XML_WRAP_HINT}"
            return

        if isinstance(msg.content, list):
            # Multi-part content: extend the last text part when there is one.
            for part in reversed(msg.content):
                if getattr(part, "type", None) != "text":
                    continue
                text_value = part.text or ""
                if XML_HINT_STRIPPED in text_value:
                    return
                part.text = f"{text_value}{XML_WRAP_HINT}"
                return

            # No text part at all: add the hint as a standalone text item
            # (stripped, since it stands alone rather than trailing text).
            messages_text = XML_WRAP_HINT.strip()
            msg.content.append(ContentItem(type="text", text=messages_text))
            return

    # No user message to annotate; nothing to do.
211
+
212
+
213
def _conversation_has_code_hint(messages: list[Message]) -> bool:
    """Return True if any system message already includes the code block hint.

    Used to avoid injecting CODE_BLOCK_HINT twice into the same conversation.
    """
    for msg in messages:
        if msg.role != "system" or msg.content is None:
            continue

        if isinstance(msg.content, str):
            if CODE_HINT_STRIPPED in msg.content:
                return True
            continue

        if isinstance(msg.content, list):
            # Scan every text part of a multi-part system message.
            for part in msg.content:
                if getattr(part, "type", None) != "text":
                    continue
                if part.text and CODE_HINT_STRIPPED in part.text:
                    return True

    return False
232
+
233
+
234
def _prepare_messages_for_model(
    source_messages: list[Message],
    tools: list[Tool] | None,
    tool_choice: str | ToolChoiceFunction | None,
    extra_instructions: list[str] | None = None,
) -> list[Message]:
    """Return a copy of messages enriched with tool instructions when needed.

    The caller's messages are never mutated; all hint/prompt injection happens
    on deep copies.
    """
    prepared = [msg.model_copy(deep=True) for msg in source_messages]

    # Collect every system-level instruction to inject (tool prompt,
    # structured-output instructions, default code-block hint).
    instructions: list[str] = []
    if tools:
        tool_prompt = _build_tool_prompt(tools, tool_choice)
        if tool_prompt:
            instructions.append(tool_prompt)

    if extra_instructions:
        instructions.extend(instr for instr in extra_instructions if instr)
        logger.debug(
            f"Applied {len(extra_instructions)} extra instructions for tool/structured output."
        )

    if not _conversation_has_code_hint(prepared):
        instructions.append(CODE_BLOCK_HINT)
        logger.debug("Injected default code block hint for Gemini conversation.")

    if not instructions:
        return prepared

    combined_instructions = "\n\n".join(instructions)

    # Merge into an existing leading string-content system message when
    # possible; otherwise prepend a fresh system message.
    if prepared and prepared[0].role == "system" and isinstance(prepared[0].content, str):
        existing = prepared[0].content or ""
        separator = "\n\n" if existing else ""
        prepared[0].content = f"{existing}{separator}{combined_instructions}"
    else:
        prepared.insert(0, Message(role="system", content=combined_instructions))

    # Remind the model of the XML tool-call format unless tools are disabled.
    if tools and tool_choice != "none":
        _append_xml_hint_to_last_user_message(prepared)

    return prepared
275
+
276
+
277
def _strip_system_hints(text: str) -> str:
    """Remove system-level hint text from a given string.

    Scrubs the injected XML/code-block hints (raw and stripped variants, since
    the model may echo either form) and any leaked chat control tokens, then
    trims surrounding whitespace.
    """
    if not text:
        return text
    cleaned = text.replace(XML_WRAP_HINT, "").replace(XML_HINT_STRIPPED, "")
    cleaned = cleaned.replace(CODE_BLOCK_HINT, "").replace(CODE_HINT_STRIPPED, "")
    cleaned = CONTROL_TOKEN_RE.sub("", cleaned)
    return cleaned.strip()
285
+
286
+
287
+ def _ensure_data_url(part: ResponseInputContent) -> str | None:
288
+ image_url = part.image_url
289
+ if not image_url and part.image_base64:
290
+ mime_type = part.mime_type or "image/png"
291
+ image_url = f"data:{mime_type};base64,{part.image_base64}"
292
+ return image_url
293
+
294
+
295
def _response_items_to_messages(
    items: str | list[ResponseInputItem],
) -> tuple[list[Message], str | list[ResponseInputItem]]:
    """Convert Responses API input items into internal Message objects and normalized input.

    Returns (messages, normalized_input): messages feed the Gemini pipeline,
    normalized_input is the cleaned echo returned to the client.
    """
    messages: list[Message] = []

    # Bare string input becomes a single user message, echoed back unchanged.
    if isinstance(items, str):
        messages.append(Message(role="user", content=items))
        logger.debug("Normalized Responses input: single string message.")
        return messages, items

    normalized_input: list[ResponseInputItem] = []
    for item in items:
        # The internal pipeline has no "developer" role; map it to "system".
        role = item.role
        if role == "developer":
            role = "system"

        content = item.content
        normalized_contents: list[ResponseInputContent] = []
        if isinstance(content, str):
            normalized_contents.append(ResponseInputContent(type="input_text", text=content))
            messages.append(Message(role=role, content=content))
        else:
            converted: list[ContentItem] = []
            for part in content:
                if part.type == "input_text":
                    text_value = part.text or ""
                    normalized_contents.append(
                        ResponseInputContent(type="input_text", text=text_value)
                    )
                    if text_value:
                        converted.append(ContentItem(type="text", text=text_value))
                elif part.type == "input_image":
                    # Inline base64 images are rewritten as data: URLs.
                    image_url = _ensure_data_url(part)
                    if image_url:
                        normalized_contents.append(
                            ResponseInputContent(type="input_image", image_url=image_url)
                        )
                        converted.append(
                            ContentItem(type="image_url", image_url={"url": image_url})
                        )
            # An all-empty part list collapses to content=None.
            messages.append(Message(role=role, content=converted or None))

        # The normalized echo keeps the original (pre-mapping) role.
        normalized_input.append(
            ResponseInputItem(type="message", role=item.role, content=normalized_contents or [])
        )

    logger.debug(
        f"Normalized Responses input: {len(normalized_input)} message items (developer roles mapped to system)."
    )
    return messages, normalized_input
346
+
347
+
348
def _instructions_to_messages(
    instructions: str | list[ResponseInputItem] | None,
) -> list[Message]:
    """Normalize instructions payload into Message objects.

    Returns an empty list when no instructions were provided.
    """
    if not instructions:
        return []

    # A bare string is a single system instruction.
    if isinstance(instructions, str):
        return [Message(role="system", content=instructions)]

    instruction_messages: list[Message] = []
    for item in instructions:
        # Ignore anything that is not a "message" item.
        if item.type and item.type != "message":
            continue

        # Map the Responses-only "developer" role onto "system".
        role = item.role
        if role == "developer":
            role = "system"

        content = item.content
        if isinstance(content, str):
            instruction_messages.append(Message(role=role, content=content))
        else:
            converted: list[ContentItem] = []
            for part in content:
                if part.type == "input_text":
                    text_value = part.text or ""
                    if text_value:
                        converted.append(ContentItem(type="text", text=text_value))
                elif part.type == "input_image":
                    # Normalize base64 payloads to data: URLs.
                    image_url = _ensure_data_url(part)
                    if image_url:
                        converted.append(
                            ContentItem(type="image_url", image_url={"url": image_url})
                        )
            # Empty part lists collapse to content=None.
            instruction_messages.append(Message(role=role, content=converted or None))

    return instruction_messages
386
+
387
+
388
def _remove_tool_call_blocks(text: str) -> str:
    """Strip tool call code blocks from text.

    Deletes every ```xml fenced block matched by TOOL_BLOCK_RE, then scrubs
    any remaining system hints via _strip_system_hints.
    """
    if not text:
        return text
    cleaned = TOOL_BLOCK_RE.sub("", text)
    return _strip_system_hints(cleaned)
394
+
395
+
396
def _extract_tool_calls(text: str) -> tuple[str, list[ToolCall]]:
    """Extract tool call definitions and return cleaned text.

    Returns (cleaned_text, tool_calls): every ```xml fenced block is removed
    from the text, and each <tool_call name="...">{...}</tool_call> entry
    found inside one becomes a ToolCall with a fresh call id.
    """
    if not text:
        return text, []

    tool_calls: list[ToolCall] = []

    def _replace(match: re.Match[str]) -> str:
        # Substitution callback: harvest <tool_call> entries from the fenced
        # block, appending to the enclosing tool_calls list, then delete the
        # block from the visible output by returning "".
        block_content = match.group(1)
        if not block_content:
            return ""

        for call_match in TOOL_CALL_RE.finditer(block_content):
            name = (call_match.group(1) or "").strip()
            raw_args = (call_match.group(2) or "").strip()
            if not name:
                logger.warning(
                    f"Encountered tool_call block without a function name: {block_content}"
                )
                continue

            # Round-trip the arguments through json to canonicalize them;
            # fall back to the raw string when they are not valid JSON.
            arguments = raw_args
            try:
                parsed_args = json.loads(raw_args)
                arguments = json.dumps(parsed_args, ensure_ascii=False)
            except json.JSONDecodeError:
                logger.warning(
                    f"Failed to parse tool call arguments for '{name}'. Passing raw string."
                )

            tool_calls.append(
                ToolCall(
                    id=f"call_{uuid.uuid4().hex}",
                    type="function",
                    function=FunctionCall(name=name, arguments=arguments),
                )
            )

        return ""

    cleaned = TOOL_BLOCK_RE.sub(_replace, text)
    cleaned = _strip_system_hints(cleaned)
    return cleaned, tool_calls
439
+
440
+
441
  @router.get("/v1/models", response_model=ModelListResponse)
442
  async def list_models(api_key: str = Depends(verify_api_key)):
443
  now = int(datetime.now(tz=timezone.utc).timestamp())
 
475
  detail="At least one message is required in the conversation.",
476
  )
477
 
478
+ structured_requirement = _build_structured_requirement(request.response_format)
479
+ if structured_requirement and request.stream:
480
+ logger.debug(
481
+ "Structured response requested with streaming enabled; will stream canonical JSON once ready."
482
+ )
483
+ if structured_requirement:
484
+ logger.debug(
485
+ f"Structured response requested for /v1/chat/completions (schema={structured_requirement.schema_name})."
486
+ )
487
+
488
+ extra_instructions = [structured_requirement.instruction] if structured_requirement else None
489
+
490
  # Check if conversation is reusable
491
  session, client, remaining_messages = _find_reusable_session(db, pool, model, request.messages)
492
 
493
  if session:
494
+ messages_to_send = _prepare_messages_for_model(
495
+ remaining_messages, request.tools, request.tool_choice, extra_instructions
496
+ )
497
+ if not messages_to_send:
498
+ raise HTTPException(
499
+ status_code=status.HTTP_400_BAD_REQUEST,
500
+ detail="No new messages to send for the existing session.",
501
+ )
502
+ if len(messages_to_send) == 1:
503
  model_input, files = await GeminiClientWrapper.process_message(
504
+ messages_to_send[0], tmp_dir, tagged=False
505
  )
506
  else:
507
  model_input, files = await GeminiClientWrapper.process_conversation(
508
+ messages_to_send, tmp_dir
509
  )
510
  logger.debug(
511
+ f"Reused session {session.metadata} - sending {len(messages_to_send)} prepared messages."
512
  )
513
  else:
514
  # Start a new session and concat messages into a single string
515
  try:
516
  client = pool.acquire()
517
  session = client.start_chat(model=model)
518
+ messages_to_send = _prepare_messages_for_model(
519
+ request.messages, request.tools, request.tool_choice, extra_instructions
520
+ )
521
  model_input, files = await GeminiClientWrapper.process_conversation(
522
+ messages_to_send, tmp_dir
523
  )
524
  except ValueError as e:
525
  raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
 
540
  raise
541
 
542
  # Format the response from API
543
+ raw_output_with_think = GeminiClientWrapper.extract_output(response, include_thoughts=True)
544
+ raw_output_clean = GeminiClientWrapper.extract_output(response, include_thoughts=False)
545
+
546
+ visible_output, tool_calls = _extract_tool_calls(raw_output_with_think)
547
+ storage_output = _remove_tool_call_blocks(raw_output_clean).strip()
548
+ tool_calls_payload = [call.model_dump(mode="json") for call in tool_calls]
549
+
550
+ if structured_requirement:
551
+ cleaned_visible = _strip_code_fence(visible_output or "")
552
+ if not cleaned_visible:
553
+ raise HTTPException(
554
+ status_code=status.HTTP_502_BAD_GATEWAY,
555
+ detail="LLM returned an empty response while JSON schema output was requested.",
556
+ )
557
+ try:
558
+ structured_payload = json.loads(cleaned_visible)
559
+ except json.JSONDecodeError as exc:
560
+ logger.warning(
561
+ f"Failed to decode JSON for structured response (schema={structured_requirement.schema_name}): "
562
+ f"{cleaned_visible}"
563
+ )
564
+ raise HTTPException(
565
+ status_code=status.HTTP_502_BAD_GATEWAY,
566
+ detail="LLM returned invalid JSON for the requested response_format.",
567
+ ) from exc
568
+
569
+ canonical_output = json.dumps(structured_payload, ensure_ascii=False)
570
+ visible_output = canonical_output
571
+ storage_output = canonical_output
572
+
573
+ if tool_calls_payload:
574
+ logger.debug(f"Detected tool calls: {tool_calls_payload}")
575
 
576
  # After formatting, persist the conversation to LMDB
577
  try:
578
+ last_message = Message(
579
+ role="assistant",
580
+ content=storage_output or None,
581
+ tool_calls=tool_calls or None,
582
+ )
583
  cleaned_history = db.sanitize_assistant_messages(request.messages)
584
  conv = ConversationInStore(
585
  model=model.model_name,
 
598
  timestamp = int(datetime.now(tz=timezone.utc).timestamp())
599
  if request.stream:
600
  return _create_streaming_response(
601
+ visible_output,
602
+ tool_calls_payload,
603
  completion_id,
604
  timestamp,
605
  request.model,
 
607
  )
608
  else:
609
  return _create_standard_response(
610
+ visible_output,
611
+ tool_calls_payload,
612
+ completion_id,
613
+ timestamp,
614
+ request.model,
615
+ request.messages,
616
  )
617
 
618
 
619
@router.post("/v1/responses")
async def create_response(
    request: ResponseCreateRequest,
    api_key: str = Depends(verify_api_key),
    tmp_dir: Path = Depends(get_temp_dir),
):
    """Handle OpenAI-compatible ``/v1/responses`` requests.

    Normalizes the Responses-API input into chat messages, reuses or
    starts a Gemini session, sends the prompt, then converts the model
    output (text, tool calls, generated images) into a Responses payload.
    The resulting conversation is persisted to LMDB; streaming requests
    get an SSE envelope around the same payload.
    """
    # Convert Responses-API input items into internal Message objects.
    messages, normalized_input = _response_items_to_messages(request.input)
    if not messages:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail="No message input provided."
        )

    # response_format handling (JSON schema etc.).
    structured_requirement = _build_structured_requirement(request.response_format)
    if structured_requirement and request.stream:
        logger.debug(
            "Structured response requested with streaming enabled; streaming not supported for Responses."
        )

    # Instructions become system messages injected ahead of the input;
    # the structured-output instruction (if any) goes first.
    preface_messages = _instructions_to_messages(request.instructions)
    if structured_requirement:
        preface_messages.insert(
            0, Message(role="system", content=structured_requirement.instruction)
        )
        logger.debug(
            f"Structured response requested for /v1/responses (schema={structured_requirement.schema_name})."
        )
    if preface_messages:
        messages = [*preface_messages, *messages]
        logger.debug(
            f"Injected {len(preface_messages)} instruction messages before sending to Gemini."
        )

    pool = GeminiClientPool()
    db = LMDBConversationStore()

    try:
        model = Model.from_name(request.model)
    except ValueError as exc:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc

    # Try to resume a previously stored Gemini session for this history.
    session, client, remaining_messages = _find_reusable_session(db, pool, model, messages)

    if session:
        messages_to_send = remaining_messages
        if not messages_to_send:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No new messages to send for the existing session.",
            )
        # A single new message is sent untagged; multiple turns are
        # serialized as a tagged conversation transcript.
        if len(messages_to_send) == 1:
            model_input, files = await GeminiClientWrapper.process_message(
                messages_to_send[0], tmp_dir, tagged=False
            )
        else:
            model_input, files = await GeminiClientWrapper.process_conversation(
                messages_to_send, tmp_dir
            )
        logger.debug(
            f"Reused session {session.metadata} - sending {len(messages_to_send)} prepared messages."
        )
    else:
        # No reusable session: acquire a client and replay the whole history.
        try:
            client = pool.acquire()
            session = client.start_chat(model=model)
            model_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir)
        except ValueError as e:
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
        except Exception as e:
            logger.exception(f"Error in preparing conversation for responses API: {e}")
            raise
        logger.debug("New session started for /v1/responses request.")

    try:
        assert session and client, "Session and client not available"
        logger.debug(
            f"Client ID: {client.id}, Input length: {len(model_input)}, files count: {len(files)}"
        )
        # Splits oversized prompts into multiple sends if necessary.
        model_output = await _send_with_split(session, model_input, files=files)
    except Exception as e:
        logger.exception(f"Error generating content from Gemini API for responses: {e}")
        raise

    # Two extractions: one keeping <think> content (for the client-visible
    # text) and one without (for LMDB storage).
    text_with_think = GeminiClientWrapper.extract_output(model_output, include_thoughts=True)
    text_without_think = GeminiClientWrapper.extract_output(model_output, include_thoughts=False)

    visible_text, detected_tool_calls = _extract_tool_calls(text_with_think)
    storage_output = _remove_tool_call_blocks(text_without_think).strip()
    assistant_text = LMDBConversationStore.remove_think_tags(visible_text.strip())

    if structured_requirement:
        # Enforce the requested JSON schema output: unwrap code fences,
        # validate JSON, and canonicalize the serialization.
        cleaned_visible = _strip_code_fence(assistant_text or "")
        if not cleaned_visible:
            raise HTTPException(
                status_code=status.HTTP_502_BAD_GATEWAY,
                detail="LLM returned an empty response while JSON schema output was requested.",
            )
        try:
            structured_payload = json.loads(cleaned_visible)
        except json.JSONDecodeError as exc:
            logger.warning(
                f"Failed to decode JSON for structured response (schema={structured_requirement.schema_name}): "
                f"{cleaned_visible}"
            )
            raise HTTPException(
                status_code=status.HTTP_502_BAD_GATEWAY,
                detail="LLM returned invalid JSON for the requested response_format.",
            ) from exc

        canonical_output = json.dumps(structured_payload, ensure_ascii=False)
        assistant_text = canonical_output
        storage_output = canonical_output
        logger.debug(
            f"Structured response fulfilled for /v1/responses (schema={structured_requirement.schema_name})."
        )

    # When the caller explicitly requested image generation but no image
    # came back, fail with 502 and a short summary of the text reply.
    expects_image = (
        request.tool_choice is not None and request.tool_choice.type == "image_generation"
    )
    if expects_image and not model_output.images:
        summary = assistant_text.strip() if assistant_text else ""
        if summary:
            summary = re.sub(r"\s+", " ", summary)
            if len(summary) > 200:
                summary = f"{summary[:197]}..."
        logger.warning(
            "Image generation was requested via tool_choice but Gemini returned no images."
        )
        detail = "LLM returned no images for the requested image_generation tool."
        if summary:
            detail = f"{detail} Assistant response: {summary}"
        raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail=detail)

    # Convert returned images to base64 output items; failures to fetch a
    # single image are logged and skipped rather than failing the request.
    image_contents: list[ResponseOutputContent] = []
    image_call_items: list[ResponseImageGenerationCall] = []
    for image in model_output.images:
        try:
            image_base64, width, height = await _image_to_base64(image, tmp_dir)
        except Exception as exc:
            logger.warning(f"Failed to download generated image: {exc}")
            continue
        # GeneratedImage saves as PNG; other (web) images are JPEG.
        mime_type = "image/png" if isinstance(image, GeneratedImage) else "image/jpeg"
        image_contents.append(
            ResponseOutputContent(
                type="output_image",
                image_base64=image_base64,
                mime_type=mime_type,
                width=width,
                height=height,
            )
        )
        image_call_items.append(
            ResponseImageGenerationCall(
                id=f"img_{uuid.uuid4().hex}",
                status="completed",
                result=image_base64,
                output_format="png" if isinstance(image, GeneratedImage) else "jpeg",
                size=f"{width}x{height}" if width and height else None,
            )
        )

    # Wrap detected tool calls as Responses-API tool-call output items.
    tool_call_items: list[ResponseToolCall] = []
    if detected_tool_calls:
        tool_call_items = [
            ResponseToolCall(
                id=call.id,
                status="completed",
                function=call.function,
            )
            for call in detected_tool_calls
        ]

    response_contents: list[ResponseOutputContent] = []
    if assistant_text:
        response_contents.append(ResponseOutputContent(type="output_text", text=assistant_text))
    response_contents.extend(image_contents)

    # The message must carry at least one content item.
    if not response_contents:
        response_contents.append(ResponseOutputContent(type="output_text", text=""))

    created_time = int(datetime.now(tz=timezone.utc).timestamp())
    response_id = f"resp_{uuid.uuid4().hex}"
    message_id = f"msg_{uuid.uuid4().hex}"

    # Token accounting is estimated from text lengths (no real tokenizer);
    # tool-call arguments count toward output tokens.
    input_tokens = sum(estimate_tokens(_text_from_message(msg)) for msg in messages)
    tool_arg_text = "".join(call.function.arguments or "" for call in detected_tool_calls)
    completion_basis = assistant_text or ""
    if tool_arg_text:
        completion_basis = (
            f"{completion_basis}\n{tool_arg_text}" if completion_basis else tool_arg_text
        )
    output_tokens = estimate_tokens(completion_basis)
    usage = ResponseUsage(
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        total_tokens=input_tokens + output_tokens,
    )

    response_payload = ResponseCreateResponse(
        id=response_id,
        created=created_time,
        model=request.model,
        output=[
            ResponseOutputMessage(
                id=message_id,
                type="message",
                role="assistant",
                content=response_contents,
            ),
            *tool_call_items,
            *image_call_items,
        ],
        output_text=assistant_text or None,
        status="completed",
        usage=usage,
        input=normalized_input or None,
        metadata=request.metadata or None,
    )

    # Persist the full conversation (history + assistant reply) so future
    # requests can resume this Gemini session. Best-effort: failures are
    # logged but never fail the request.
    try:
        last_message = Message(
            role="assistant",
            content=storage_output or None,
            tool_calls=detected_tool_calls or None,
        )
        cleaned_history = db.sanitize_assistant_messages(messages)
        conv = ConversationInStore(
            model=model.model_name,
            client_id=client.id,
            metadata=session.metadata,
            messages=[*cleaned_history, last_message],
        )
        key = db.store(conv)
        logger.debug(f"Conversation saved to LMDB with key: {key}")
    except Exception as exc:
        logger.warning(f"Failed to save Responses conversation to LMDB: {exc}")

    if request.stream:
        logger.debug(
            f"Streaming Responses API payload (response_id={response_payload.id}, text_chunks={bool(assistant_text)})."
        )
        return _create_responses_streaming_response(response_payload, assistant_text or "")

    return response_payload
862
+
863
+
864
  def _text_from_message(message: Message) -> str:
865
  """Return text content from a message for token estimation."""
866
+ base_text = ""
867
  if isinstance(message.content, str):
868
+ base_text = message.content
869
+ elif isinstance(message.content, list):
870
+ base_text = "\n".join(
871
+ item.text or "" for item in message.content if getattr(item, "type", "") == "text"
872
+ )
873
+ elif message.content is None:
874
+ base_text = ""
875
+
876
+ if message.tool_calls:
877
+ tool_arg_text = "".join(call.function.arguments or "" for call in message.tool_calls)
878
+ base_text = f"{base_text}\n{tool_arg_text}" if base_text else tool_arg_text
879
+
880
+ return base_text
881
 
882
 
883
  def _find_reusable_session(
 
893
  ---------
894
  When a reply was generated by *another* server instance, the local LMDB may
895
  only contain an older part of the conversation. However, as long as we can
896
+ line up **any** earlier assistant/system response, we can restore the
897
  corresponding Gemini session and replay the *remaining* turns locally
898
  (including that missing assistant reply and the subsequent user prompts).
899
 
 
969
  return await session.send_message(chunks[-1], files=files)
970
 
971
 
972
+ def _iter_stream_segments(model_output: str, chunk_size: int = 64):
973
+ """Yield stream segments while keeping <think> markers and words intact."""
974
+ if not model_output:
975
+ return
976
+
977
+ token_pattern = re.compile(r"\s+|\S+\s*")
978
+ pending = ""
979
+
980
+ def _flush_pending() -> Iterator[str]:
981
+ nonlocal pending
982
+ if pending:
983
+ yield pending
984
+ pending = ""
985
+
986
+ # Split on <think> boundaries so the markers are never fragmented.
987
+ parts = re.split(r"(</?think>)", model_output)
988
+ for part in parts:
989
+ if not part:
990
+ continue
991
+ if part in {"<think>", "</think>"}:
992
+ yield from _flush_pending()
993
+ yield part
994
+ continue
995
+
996
+ for match in token_pattern.finditer(part):
997
+ token = match.group(0)
998
+
999
+ if len(token) > chunk_size:
1000
+ yield from _flush_pending()
1001
+ for idx in range(0, len(token), chunk_size):
1002
+ yield token[idx : idx + chunk_size]
1003
+ continue
1004
+
1005
+ if pending and len(pending) + len(token) > chunk_size:
1006
+ yield from _flush_pending()
1007
+
1008
+ pending += token
1009
+
1010
+ yield from _flush_pending()
1011
+
1012
+
1013
  def _create_streaming_response(
1014
  model_output: str,
1015
+ tool_calls: list[dict],
1016
  completion_id: str,
1017
  created_time: int,
1018
  model: str,
 
1022
 
1023
  # Calculate token usage
1024
  prompt_tokens = sum(estimate_tokens(_text_from_message(msg)) for msg in messages)
1025
+ tool_args = "".join(call.get("function", {}).get("arguments", "") for call in tool_calls or [])
1026
+ completion_tokens = estimate_tokens(model_output + tool_args)
1027
  total_tokens = prompt_tokens + completion_tokens
1028
+ finish_reason = "tool_calls" if tool_calls else "stop"
1029
 
1030
  async def generate_stream():
1031
  # Send start event
 
1039
  yield f"data: {orjson.dumps(data).decode('utf-8')}\n\n"
1040
 
1041
  # Stream output text in chunks for efficiency
1042
+ for chunk in _iter_stream_segments(model_output):
 
 
1043
  data = {
1044
  "id": completion_id,
1045
  "object": "chat.completion.chunk",
 
1049
  }
1050
  yield f"data: {orjson.dumps(data).decode('utf-8')}\n\n"
1051
 
1052
+ if tool_calls:
1053
+ tool_calls_delta = [{**call, "index": idx} for idx, call in enumerate(tool_calls)]
1054
+ data = {
1055
+ "id": completion_id,
1056
+ "object": "chat.completion.chunk",
1057
+ "created": created_time,
1058
+ "model": model,
1059
+ "choices": [
1060
+ {
1061
+ "index": 0,
1062
+ "delta": {"tool_calls": tool_calls_delta},
1063
+ "finish_reason": None,
1064
+ }
1065
+ ],
1066
+ }
1067
+ yield f"data: {orjson.dumps(data).decode('utf-8')}\n\n"
1068
+
1069
  # Send end event
1070
  data = {
1071
  "id": completion_id,
1072
  "object": "chat.completion.chunk",
1073
  "created": created_time,
1074
  "model": model,
1075
+ "choices": [{"index": 0, "delta": {}, "finish_reason": finish_reason}],
1076
  "usage": {
1077
  "prompt_tokens": prompt_tokens,
1078
  "completion_tokens": completion_tokens,
 
1085
  return StreamingResponse(generate_stream(), media_type="text/event-stream")
1086
 
1087
 
1088
def _create_responses_streaming_response(
    response_payload: ResponseCreateResponse,
    assistant_text: str | None,
) -> StreamingResponse:
    """Create streaming response for Responses API using event types defined by OpenAI.

    Emits ``response.created``, zero or more ``response.output_text.delta``
    events, a single ``response.output_text.done``, then
    ``response.completed`` carrying the full payload, followed by the
    ``[DONE]`` sentinel.
    """
    response_dict = response_payload.model_dump(mode="json")
    response_id = response_payload.id
    created_time = response_payload.created
    model = response_payload.model

    logger.debug(
        f"Preparing streaming envelope for /v1/responses (response_id={response_id}, model={model})."
    )

    # Envelope fields repeated on every SSE event.
    base_event = {
        "id": response_id,
        "object": "response",
        "created": created_time,
        "model": model,
    }

    # Minimal in-progress snapshot sent with the initial event.
    created_snapshot: dict[str, Any] = {
        "id": response_id,
        "object": "response",
        "created": created_time,
        "model": model,
        "status": "in_progress",
    }
    if response_dict.get("metadata") is not None:
        created_snapshot["metadata"] = response_dict["metadata"]
    if response_dict.get("input") is not None:
        created_snapshot["input"] = response_dict["input"]

    async def generate_stream():
        # Emit creation event
        data = {
            **base_event,
            "type": "response.created",
            "response": created_snapshot,
        }
        yield f"data: {orjson.dumps(data).decode('utf-8')}\n\n"

        # Stream textual content, if any
        if assistant_text:
            for chunk in _iter_stream_segments(assistant_text):
                delta_event = {
                    **base_event,
                    "type": "response.output_text.delta",
                    "output_index": 0,
                    "delta": chunk,
                }
                yield f"data: {orjson.dumps(delta_event).decode('utf-8')}\n\n"

        # The done event fires exactly once whether or not text was
        # streamed. (Previously this dict was duplicated in both branches
        # of an if/else; the behavior is identical.)
        done_event = {
            **base_event,
            "type": "response.output_text.done",
            "output_index": 0,
        }
        yield f"data: {orjson.dumps(done_event).decode('utf-8')}\n\n"

        # Emit completed event with full payload
        completed_event = {
            **base_event,
            "type": "response.completed",
            "response": response_dict,
        }
        yield f"data: {orjson.dumps(completed_event).decode('utf-8')}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate_stream(), media_type="text/event-stream")
1166
+
1167
+
1168
  def _create_standard_response(
1169
  model_output: str,
1170
+ tool_calls: list[dict],
1171
  completion_id: str,
1172
  created_time: int,
1173
  model: str,
 
1176
  """Create standard response"""
1177
  # Calculate token usage
1178
  prompt_tokens = sum(estimate_tokens(_text_from_message(msg)) for msg in messages)
1179
+ tool_args = "".join(call.get("function", {}).get("arguments", "") for call in tool_calls or [])
1180
+ completion_tokens = estimate_tokens(model_output + tool_args)
1181
  total_tokens = prompt_tokens + completion_tokens
1182
+ finish_reason = "tool_calls" if tool_calls else "stop"
1183
+
1184
+ message_payload: dict = {"role": "assistant", "content": model_output or None}
1185
+ if tool_calls:
1186
+ message_payload["tool_calls"] = tool_calls
1187
 
1188
  result = {
1189
  "id": completion_id,
 
1193
  "choices": [
1194
  {
1195
  "index": 0,
1196
+ "message": message_payload,
1197
+ "finish_reason": finish_reason,
1198
  }
1199
  ],
1200
  "usage": {
 
1206
 
1207
  logger.debug(f"Response created with {total_tokens} total tokens")
1208
  return result
1209
+
1210
+
1211
+ def _extract_image_dimensions(data: bytes) -> tuple[int | None, int | None]:
1212
+ """Return image dimensions (width, height) if PNG or JPEG headers are present."""
1213
+ # PNG: dimensions stored in bytes 16..24 of the IHDR chunk
1214
+ if len(data) >= 24 and data.startswith(b"\x89PNG\r\n\x1a\n"):
1215
+ try:
1216
+ width, height = struct.unpack(">II", data[16:24])
1217
+ return int(width), int(height)
1218
+ except struct.error:
1219
+ return None, None
1220
+
1221
+ # JPEG: dimensions stored in SOF segment; iterate through markers to locate it
1222
+ if len(data) >= 4 and data[0:2] == b"\xff\xd8":
1223
+ idx = 2
1224
+ length = len(data)
1225
+ sof_markers = {
1226
+ 0xC0,
1227
+ 0xC1,
1228
+ 0xC2,
1229
+ 0xC3,
1230
+ 0xC5,
1231
+ 0xC6,
1232
+ 0xC7,
1233
+ 0xC9,
1234
+ 0xCA,
1235
+ 0xCB,
1236
+ 0xCD,
1237
+ 0xCE,
1238
+ 0xCF,
1239
+ }
1240
+ while idx < length:
1241
+ # Find marker alignment (markers are prefixed with 0xFF bytes)
1242
+ if data[idx] != 0xFF:
1243
+ idx += 1
1244
+ continue
1245
+ while idx < length and data[idx] == 0xFF:
1246
+ idx += 1
1247
+ if idx >= length:
1248
+ break
1249
+ marker = data[idx]
1250
+ idx += 1
1251
+
1252
+ if marker in (0xD8, 0xD9, 0x01) or 0xD0 <= marker <= 0xD7:
1253
+ continue
1254
+
1255
+ if idx + 1 >= length:
1256
+ break
1257
+ segment_length = (data[idx] << 8) + data[idx + 1]
1258
+ idx += 2
1259
+ if segment_length < 2:
1260
+ break
1261
+
1262
+ if marker in sof_markers:
1263
+ if idx + 4 < length:
1264
+ # Skip precision byte at idx, then read height/width (big-endian)
1265
+ height = (data[idx + 1] << 8) + data[idx + 2]
1266
+ width = (data[idx + 3] << 8) + data[idx + 4]
1267
+ return int(width), int(height)
1268
+ break
1269
+
1270
+ idx += segment_length - 2
1271
+
1272
+ return None, None
1273
+
1274
+
1275
async def _image_to_base64(image: Image, temp_dir: Path) -> tuple[str, int | None, int | None]:
    """Persist an image provided by gemini_webapi and return base64 plus dimensions.

    Generated images are saved at full resolution; other images use the
    default save behavior. Raises ``ValueError`` when saving yields no path.
    """
    save_kwargs: dict = {"path": str(temp_dir)}
    if isinstance(image, GeneratedImage):
        # Generated images support (and deserve) the full-size download.
        save_kwargs["full_size"] = True
    saved_path = await image.save(**save_kwargs)

    if not saved_path:
        raise ValueError("Failed to save generated image")

    raw_bytes = Path(saved_path).read_bytes()
    width, height = _extract_image_dimensions(raw_bytes)
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return encoded, width, height
app/server/middleware.py CHANGED
@@ -17,7 +17,8 @@ def global_exception_handler(request: Request, exc: Exception):
17
  )
18
 
19
  return ORJSONResponse(
20
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"error": {"message": str(exc)}}
 
21
  )
22
 
23
 
 
17
  )
18
 
19
  return ORJSONResponse(
20
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
21
+ content={"error": {"message": str(exc)}},
22
  )
23
 
24
 
app/services/client.py CHANGED
@@ -1,6 +1,9 @@
1
  import asyncio
 
 
2
  import re
3
  from pathlib import Path
 
4
 
5
  from gemini_webapi import GeminiClient, ModelOutput
6
  from gemini_webapi.client import ChatSession
@@ -12,7 +15,28 @@ from ..models import Message
12
  from ..utils import g_config
13
  from ..utils.helper import add_tag, save_file_to_tempfile, save_url_to_tempfile
14
 
15
- XML_WRAP_HINT = "\nFor any xml block, e.g. tool call, always wrap it with: \n`````xml\n...\n`````\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  class GeminiClientWrapper(GeminiClient):
@@ -22,16 +46,32 @@ class GeminiClientWrapper(GeminiClient):
22
  super().__init__(**kwargs)
23
  self.id = client_id
24
 
25
- async def init(self, **kwargs):
 
 
 
 
 
 
 
 
26
  """
27
  Inject default configuration values.
28
  """
29
- kwargs.setdefault("timeout", g_config.gemini.timeout)
30
- kwargs.setdefault("auto_refresh", g_config.gemini.auto_refresh)
31
- kwargs.setdefault("verbose", g_config.gemini.verbose)
32
- kwargs.setdefault("refresh_interval", g_config.gemini.refresh_interval)
 
33
 
34
- await super().init(**kwargs)
 
 
 
 
 
 
 
35
 
36
  async def generate_content(
37
  self,
@@ -41,22 +81,23 @@ class GeminiClientWrapper(GeminiClient):
41
  gem: Gem | str | None = None,
42
  chat: ChatSession | None = None,
43
  **kwargs,
44
- ):
45
  cnt = 2 # Try 2 times before giving up
46
- last_exception = None
47
  while cnt:
48
  cnt -= 1
49
  try:
50
  return await super().generate_content(prompt, files, model, gem, chat, **kwargs)
51
  except ModelInvalid as e:
52
- # This is not always caused by model selection. Instead it can be solved by retrying.
53
  # So we catch it and retry as a workaround.
54
  await asyncio.sleep(1)
55
  last_exception = e
56
 
57
  # If retrying failed, re-raise ModelInvalid
58
- if last_exception:
59
  raise last_exception
 
60
 
61
  @staticmethod
62
  async def process_message(
@@ -65,22 +106,21 @@ class GeminiClientWrapper(GeminiClient):
65
  """
66
  Process a single message and return model input.
67
  """
68
- model_input = ""
69
  files: list[Path | str] = []
 
 
70
  if isinstance(message.content, str):
71
  # Pure text content
72
- model_input = message.content
73
- else:
 
74
  # Mixed content
75
  # TODO: Use Pydantic to enforce the value checking
76
  for item in message.content:
77
  if item.type == "text":
78
  # Append multiple text fragments
79
  if item.text:
80
- if model_input:
81
- model_input += "\n" + item.text
82
- else:
83
- model_input = item.text
84
 
85
  elif item.type == "image_url":
86
  if not item.image_url:
@@ -98,20 +138,33 @@ class GeminiClientWrapper(GeminiClient):
98
  files.append(await save_file_to_tempfile(file_data, filename, tempdir))
99
  else:
100
  raise ValueError("File must contain 'file_data' key")
 
 
101
 
102
- # This is a workaround for Gemini Web's displaying issues with XML blocks.
103
- # Add this for tool calling
104
- if re.search(r"<\s*[^>]+>", model_input):
105
- hint = XML_WRAP_HINT
106
- else:
107
- hint = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  # Add role tag if needed
110
  if model_input:
111
  if tagged:
112
- model_input = add_tag(message.role, model_input + hint)
113
- else:
114
- model_input += hint
115
 
116
  return model_input, files
117
 
@@ -161,7 +214,36 @@ class GeminiClientWrapper(GeminiClient):
161
  text += str(response)
162
 
163
  # Fix some escaped characters
164
- text = text.replace("&lt;", "<").replace("\\<", "<").replace("\\_", "_").replace("\\>", ">")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
  def simplify_link_target(text_content: str) -> str:
167
  match_colon_num = re.match(r"([^:]+:\d+)", text_content)
@@ -181,7 +263,7 @@ class GeminiClientWrapper(GeminiClient):
181
  else:
182
  return new_link_segment
183
 
184
- # Replace Google search links with simplified markdown links
185
  pattern = r"(\()?\[`([^`]+?)`\]\((https://www.google.com/search\?q=)(.*?)(?<!\\)\)\)*(\))?"
186
  text = re.sub(pattern, replacer, text)
187
 
 
1
  import asyncio
2
+ import html
3
+ import json
4
  import re
5
  from pathlib import Path
6
+ from typing import Any, cast
7
 
8
  from gemini_webapi import GeminiClient, ModelOutput
9
  from gemini_webapi.client import ChatSession
 
15
  from ..utils import g_config
16
  from ..utils.helper import add_tag, save_file_to_tempfile, save_url_to_tempfile
17
 
18
+ XML_WRAP_HINT = (
19
+ "\nYou MUST wrap every tool call response inside a single fenced block exactly like:\n"
20
+ '```xml\n<tool_call name="tool_name">{"arg": "value"}</tool_call>\n```\n'
21
+ "Do not surround the fence with any other text or whitespace; otherwise the call will be ignored.\n"
22
+ )
23
+
24
+ CODE_BLOCK_HINT = (
25
+ "\nWhenever you include code, markup, or shell snippets, wrap each snippet in a Markdown fenced "
26
+ "block and supply the correct language label (for example, ```python ... ``` or ```html ... ```).\n"
27
+ "Fence ONLY the actual code/markup; keep all narrative or explanatory text outside the fences.\n"
28
+ )
29
+
30
+ HTML_ESCAPE_RE = re.compile(r"&(?:lt|gt|amp|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);")
31
+ MARKDOWN_ESCAPE_RE = re.compile(r"\\(?=\s*[-\\`*_{}\[\]()#+.!<>])")
32
+ CODE_FENCE_RE = re.compile(r"(```.*?```|`[^`]*`)", re.DOTALL)
33
+
34
+
35
+ _UNSET = object()
36
+
37
+
38
+ def _resolve(value: Any, fallback: Any):
39
+ return fallback if value is _UNSET else value
40
 
41
 
42
  class GeminiClientWrapper(GeminiClient):
 
46
  super().__init__(**kwargs)
47
  self.id = client_id
48
 
49
+ async def init(
50
+ self,
51
+ timeout: float = cast(float, _UNSET),
52
+ auto_close: bool = False,
53
+ close_delay: float = 300,
54
+ auto_refresh: bool = cast(bool, _UNSET),
55
+ refresh_interval: float = cast(float, _UNSET),
56
+ verbose: bool = cast(bool, _UNSET),
57
+ ) -> None:
58
  """
59
  Inject default configuration values.
60
  """
61
+ config = g_config.gemini
62
+ timeout = cast(float, _resolve(timeout, config.timeout))
63
+ auto_refresh = cast(bool, _resolve(auto_refresh, config.auto_refresh))
64
+ refresh_interval = cast(float, _resolve(refresh_interval, config.refresh_interval))
65
+ verbose = cast(bool, _resolve(verbose, config.verbose))
66
 
67
+ await super().init(
68
+ timeout=timeout,
69
+ auto_close=auto_close,
70
+ close_delay=close_delay,
71
+ auto_refresh=auto_refresh,
72
+ refresh_interval=refresh_interval,
73
+ verbose=verbose,
74
+ )
75
 
76
  async def generate_content(
77
  self,
 
81
  gem: Gem | str | None = None,
82
  chat: ChatSession | None = None,
83
  **kwargs,
84
+ ) -> ModelOutput:
85
  cnt = 2 # Try 2 times before giving up
86
+ last_exception: ModelInvalid | None = None
87
  while cnt:
88
  cnt -= 1
89
  try:
90
  return await super().generate_content(prompt, files, model, gem, chat, **kwargs)
91
  except ModelInvalid as e:
92
+ # This is not always caused by model selection. Instead, it can be solved by retrying.
93
  # So we catch it and retry as a workaround.
94
  await asyncio.sleep(1)
95
  last_exception = e
96
 
97
  # If retrying failed, re-raise ModelInvalid
98
+ if last_exception is not None:
99
  raise last_exception
100
+ raise RuntimeError("generate_content failed without receiving a ModelInvalid error.")
101
 
102
  @staticmethod
103
  async def process_message(
 
106
  """
107
  Process a single message and return model input.
108
  """
 
109
  files: list[Path | str] = []
110
+ text_fragments: list[str] = []
111
+
112
  if isinstance(message.content, str):
113
  # Pure text content
114
+ if message.content:
115
+ text_fragments.append(message.content)
116
+ elif isinstance(message.content, list):
117
  # Mixed content
118
  # TODO: Use Pydantic to enforce the value checking
119
  for item in message.content:
120
  if item.type == "text":
121
  # Append multiple text fragments
122
  if item.text:
123
+ text_fragments.append(item.text)
 
 
 
124
 
125
  elif item.type == "image_url":
126
  if not item.image_url:
 
138
  files.append(await save_file_to_tempfile(file_data, filename, tempdir))
139
  else:
140
  raise ValueError("File must contain 'file_data' key")
141
+ elif message.content is not None:
142
+ raise ValueError("Unsupported message content type.")
143
 
144
+ if message.tool_calls:
145
+ tool_blocks: list[str] = []
146
+ for call in message.tool_calls:
147
+ args_text = call.function.arguments.strip()
148
+ try:
149
+ parsed_args = json.loads(args_text)
150
+ args_text = json.dumps(parsed_args, ensure_ascii=False)
151
+ except (json.JSONDecodeError, TypeError):
152
+ # Leave args_text as is if it is not valid JSON
153
+ pass
154
+ tool_blocks.append(
155
+ f'<tool_call name="{call.function.name}">{args_text}</tool_call>'
156
+ )
157
+
158
+ if tool_blocks:
159
+ tool_section = "```xml\n" + "\n".join(tool_blocks) + "\n```"
160
+ text_fragments.append(tool_section)
161
+
162
+ model_input = "\n".join(fragment for fragment in text_fragments if fragment)
163
 
164
  # Add role tag if needed
165
  if model_input:
166
  if tagged:
167
+ model_input = add_tag(message.role, model_input)
 
 
168
 
169
  return model_input, files
170
 
 
214
  text += str(response)
215
 
216
  # Fix some escaped characters
217
+ def _unescape_html(text_content: str) -> str:
218
+ parts: list[str] = []
219
+ last_index = 0
220
+ for match in CODE_FENCE_RE.finditer(text_content):
221
+ non_code = text_content[last_index : match.start()]
222
+ if non_code:
223
+ parts.append(HTML_ESCAPE_RE.sub(lambda m: html.unescape(m.group(0)), non_code))
224
+ parts.append(match.group(0))
225
+ last_index = match.end()
226
+ tail = text_content[last_index:]
227
+ if tail:
228
+ parts.append(HTML_ESCAPE_RE.sub(lambda m: html.unescape(m.group(0)), tail))
229
+ return "".join(parts)
230
+
231
+ def _unescape_markdown(text_content: str) -> str:
232
+ parts: list[str] = []
233
+ last_index = 0
234
+ for match in CODE_FENCE_RE.finditer(text_content):
235
+ non_code = text_content[last_index : match.start()]
236
+ if non_code:
237
+ parts.append(MARKDOWN_ESCAPE_RE.sub("", non_code))
238
+ parts.append(match.group(0))
239
+ last_index = match.end()
240
+ tail = text_content[last_index:]
241
+ if tail:
242
+ parts.append(MARKDOWN_ESCAPE_RE.sub("", tail))
243
+ return "".join(parts)
244
+
245
+ text = _unescape_html(text)
246
+ text = _unescape_markdown(text)
247
 
248
  def simplify_link_target(text_content: str) -> str:
249
  match_colon_num = re.match(r"([^:]+:\d+)", text_content)
 
263
  else:
264
  return new_link_segment
265
 
266
+ # Replace Google search links with simplified Markdown links
267
  pattern = r"(\()?\[`([^`]+?)`\]\((https://www.google.com/search\?q=)(.*?)(?<!\\)\)\)*(\))?"
268
  text = re.sub(pattern, replacer, text)
269
 
app/utils/config.py CHANGED
@@ -174,7 +174,8 @@ def extract_gemini_clients_env() -> dict[int, dict[str, str]]:
174
 
175
 
176
  def _merge_clients_with_env(
177
- base_clients: list[GeminiClientSettings] | None, env_overrides: dict[int, dict[str, str]]
 
178
  ):
179
  """Override base_clients with env_overrides, return the new clients list."""
180
  if not env_overrides:
 
174
 
175
 
176
  def _merge_clients_with_env(
177
+ base_clients: list[GeminiClientSettings] | None,
178
+ env_overrides: dict[int, dict[str, str]],
179
  ):
180
  """Override base_clients with env_overrides, return the new clients list."""
181
  if not env_overrides:
app/utils/helper.py CHANGED
@@ -5,10 +5,12 @@ from pathlib import Path
5
  import httpx
6
  from loguru import logger
7
 
 
 
8
 
9
  def add_tag(role: str, content: str, unclose: bool = False) -> str:
10
  """Surround content with role tags"""
11
- if role not in ["user", "assistant", "system"]:
12
  logger.warning(f"Unknown role: {role}, returning content without tags")
13
  return content
14
 
@@ -34,6 +36,8 @@ async def save_file_to_tempfile(
34
 
35
 
36
  async def save_url_to_tempfile(url: str, tempdir: Path | None = None):
 
 
37
  if url.startswith("data:image/"):
38
  # Base64 encoded image
39
  base64_data = url.split(",")[1]
 
5
  import httpx
6
  from loguru import logger
7
 
8
+ VALID_TAG_ROLES = {"user", "assistant", "system", "tool"}
9
+
10
 
11
  def add_tag(role: str, content: str, unclose: bool = False) -> str:
12
  """Surround content with role tags"""
13
+ if role not in VALID_TAG_ROLES:
14
  logger.warning(f"Unknown role: {role}, returning content without tags")
15
  return content
16
 
 
36
 
37
 
38
  async def save_url_to_tempfile(url: str, tempdir: Path | None = None):
39
+ data: bytes | None = None
40
+ suffix: str | None = None
41
  if url.startswith("data:image/"):
42
  # Base64 encoded image
43
  base64_data = url.split(",")[1]
run.py CHANGED
@@ -20,7 +20,9 @@ if __name__ == "__main__":
20
 
21
  # Check if the certificate files exist
22
  if not os.path.exists(key_path) or not os.path.exists(cert_path):
23
- logger.critical(f"HTTPS enabled but SSL certificate files not found: {key_path}, {cert_path}")
 
 
24
  sys.exit(1)
25
 
26
  logger.info(f"Starting server at https://{g_config.server.host}:{g_config.server.port} ...")
 
20
 
21
  # Check if the certificate files exist
22
  if not os.path.exists(key_path) or not os.path.exists(cert_path):
23
+ logger.critical(
24
+ f"HTTPS enabled but SSL certificate files not found: {key_path}, {cert_path}"
25
+ )
26
  sys.exit(1)
27
 
28
  logger.info(f"Starting server at https://{g_config.server.host}:{g_config.server.port} ...")