Commit
Β·
a82e45b
1
Parent(s):
92bb437
Fix OpenAI API compatibility: support tool_choice='required' and response_format
Browse files- Add 'required' to tool_choice Literal type to accept PydanticAI's output_type requests
- Add response_format field to ChatCompletionRequest for structured JSON outputs
- Update router to pass response_format to provider
- Update provider to handle response_format and enforce JSON output in prompts
- Convert tool_choice='required' to 'auto' for text-based tool calls
- Add JSON extraction from markdown code blocks
- Update Transformers version to 4.45.0+ for better Qwen support
- Add comprehensive verification documentation
- Dockerfile +3 -1
- app/models/openai.py +7 -1
- app/providers/transformers_provider.py +59 -4
- app/routers/openai_api.py +11 -1
- docs/openai_api_verification.md +203 -0
- docs/transformers_verification.md +190 -0
Dockerfile
CHANGED
|
@@ -39,8 +39,10 @@ RUN pip install --no-cache-dir \
|
|
| 39 |
--index-url https://download.pytorch.org/whl/cu124
|
| 40 |
|
| 41 |
# Install ML dependencies (single layer, cached)
|
|
|
|
|
|
|
| 42 |
RUN pip install --no-cache-dir \
|
| 43 |
-
transformers>=4.
|
| 44 |
accelerate>=0.30.0 \
|
| 45 |
bitsandbytes
|
| 46 |
|
|
|
|
| 39 |
--index-url https://download.pytorch.org/whl/cu124
|
| 40 |
|
| 41 |
# Install ML dependencies (single layer, cached)
|
| 42 |
+
# Transformers 4.45.0+ recommended for Qwen models (supports latest features)
|
| 43 |
+
# PyTorch 2.5.0+ for CUDA 12.4 compatibility
|
| 44 |
RUN pip install --no-cache-dir \
|
| 45 |
+
transformers>=4.45.0 \
|
| 46 |
accelerate>=0.30.0 \
|
| 47 |
bitsandbytes
|
| 48 |
|
app/models/openai.py
CHANGED
|
@@ -23,6 +23,11 @@ class Tool(BaseModel):
|
|
| 23 |
function: Function
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
class ChatCompletionRequest(BaseModel):
|
| 27 |
model: Optional[str] = None # Optional, will use default from config
|
| 28 |
messages: List[Message]
|
|
@@ -31,7 +36,8 @@ class ChatCompletionRequest(BaseModel):
|
|
| 31 |
stream: Optional[bool] = False
|
| 32 |
top_p: Optional[float] = 1.0
|
| 33 |
tools: Optional[List[Tool]] = None # β
Tool definitions
|
| 34 |
-
tool_choice: Optional[Union[Literal["none", "auto"], Dict[str, Any]]] = None # β
Tool choice
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
class FunctionCall(BaseModel):
|
|
|
|
| 23 |
function: Function
|
| 24 |
|
| 25 |
|
| 26 |
+
class ResponseFormat(BaseModel):
|
| 27 |
+
"""Response format for structured outputs."""
|
| 28 |
+
type: Literal["text", "json_object"]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
class ChatCompletionRequest(BaseModel):
|
| 32 |
model: Optional[str] = None # Optional, will use default from config
|
| 33 |
messages: List[Message]
|
|
|
|
| 36 |
stream: Optional[bool] = False
|
| 37 |
top_p: Optional[float] = 1.0
|
| 38 |
tools: Optional[List[Tool]] = None # β
Tool definitions
|
| 39 |
+
tool_choice: Optional[Union[Literal["none", "auto", "required"], Dict[str, Any]]] = None # β
Tool choice (added "required" for output_type)
|
| 40 |
+
response_format: Optional[Union[ResponseFormat, Dict[str, Any]]] = None # β
Response format for structured outputs
|
| 41 |
|
| 42 |
|
| 43 |
class FunctionCall(BaseModel):
|
app/providers/transformers_provider.py
CHANGED
|
@@ -234,11 +234,25 @@ class TransformersProvider:
|
|
| 234 |
top_p = payload.get("top_p", DEFAULT_TOP_P)
|
| 235 |
tools = payload.get("tools", None) # β
Extract tools
|
| 236 |
tool_choice = payload.get("tool_choice", "auto") # β
Extract tool_choice
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
# Detect French and add system prompt if needed
|
| 239 |
if is_french_request(messages) and not has_french_system_prompt(messages):
|
| 240 |
messages = [{"role": "system", "content": FRENCH_SYSTEM_PROMPT}] + messages
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
# β
Add tools to system prompt if provided
|
| 243 |
if tools:
|
| 244 |
tools_description = self._format_tools_for_prompt(tools)
|
|
@@ -253,6 +267,21 @@ class TransformersProvider:
|
|
| 253 |
messages = [{"role": "system", "content": tools_description}] + messages
|
| 254 |
log_info(f"Tools added to prompt: {len(tools)} tools")
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
# Generate prompt using chat template
|
| 257 |
if hasattr(tokenizer, "apply_chat_template"):
|
| 258 |
prompt = tokenizer.apply_chat_template(
|
|
@@ -273,16 +302,16 @@ class TransformersProvider:
|
|
| 273 |
|
| 274 |
# Handle streaming vs non-streaming
|
| 275 |
if stream:
|
| 276 |
-
return self._chat_stream(inputs, temperature, top_p, max_tokens, payload.get("model", MODEL_NAME), tools)
|
| 277 |
|
| 278 |
-
return self._generate_response(inputs, temperature, top_p, max_tokens, payload.get("model", MODEL_NAME), tools)
|
| 279 |
|
| 280 |
except Exception as e:
|
| 281 |
log_error(f"Error in chat completion: {str(e)}", exc_info=True)
|
| 282 |
raise
|
| 283 |
|
| 284 |
def _generate_response(
|
| 285 |
-
self, inputs, temperature: float, top_p: float, max_tokens: int, model_id: str, tools: Optional[List[Dict[str, Any]]] = None
|
| 286 |
) -> Dict[str, Any]:
|
| 287 |
"""Generate non-streaming response."""
|
| 288 |
try:
|
|
@@ -308,6 +337,10 @@ class TransformersProvider:
|
|
| 308 |
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
|
| 309 |
completion_tokens = len(generated_ids)
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
# β
Parse tool calls from generated text
|
| 312 |
tool_calls = None
|
| 313 |
if tools:
|
|
@@ -367,7 +400,7 @@ class TransformersProvider:
|
|
| 367 |
gc.collect()
|
| 368 |
|
| 369 |
async def _chat_stream(
|
| 370 |
-
self, inputs, temperature: float, top_p: float, max_tokens: int, model_id: str, tools: Optional[List[Dict[str, Any]]] = None
|
| 371 |
) -> AsyncIterator[str]:
|
| 372 |
"""Stream chat completions."""
|
| 373 |
completion_id = f"chatcmpl-{os.urandom(12).hex()}"
|
|
@@ -553,6 +586,28 @@ class TransformersProvider:
|
|
| 553 |
# Clean up extra whitespace
|
| 554 |
text = re.sub(r'\n\s*\n', '\n\n', text)
|
| 555 |
return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
|
| 557 |
|
| 558 |
# Module-level provider instance
|
|
|
|
| 234 |
top_p = payload.get("top_p", DEFAULT_TOP_P)
|
| 235 |
tools = payload.get("tools", None) # β
Extract tools
|
| 236 |
tool_choice = payload.get("tool_choice", "auto") # β
Extract tool_choice
|
| 237 |
+
response_format = payload.get("response_format", None) # β
Extract response_format
|
| 238 |
+
|
| 239 |
+
# Handle tool_choice="required" - treat as "auto" for text-based tool calls
|
| 240 |
+
if tool_choice == "required":
|
| 241 |
+
tool_choice = "auto"
|
| 242 |
+
log_info("tool_choice='required' converted to 'auto' for text-based tool calls")
|
| 243 |
|
| 244 |
# Detect French and add system prompt if needed
|
| 245 |
if is_french_request(messages) and not has_french_system_prompt(messages):
|
| 246 |
messages = [{"role": "system", "content": FRENCH_SYSTEM_PROMPT}] + messages
|
| 247 |
|
| 248 |
+
# β
Handle response_format for structured JSON outputs
|
| 249 |
+
json_output_required = False
|
| 250 |
+
if response_format:
|
| 251 |
+
if isinstance(response_format, dict):
|
| 252 |
+
json_output_required = response_format.get("type") == "json_object"
|
| 253 |
+
elif hasattr(response_format, "type"):
|
| 254 |
+
json_output_required = response_format.type == "json_object"
|
| 255 |
+
|
| 256 |
# β
Add tools to system prompt if provided
|
| 257 |
if tools:
|
| 258 |
tools_description = self._format_tools_for_prompt(tools)
|
|
|
|
| 267 |
messages = [{"role": "system", "content": tools_description}] + messages
|
| 268 |
log_info(f"Tools added to prompt: {len(tools)} tools")
|
| 269 |
|
| 270 |
+
# β
Add JSON output requirement to system prompt if response_format requires it
|
| 271 |
+
if json_output_required:
|
| 272 |
+
json_instruction = (
|
| 273 |
+
"\n\nIMPORTANT: Vous devez rΓ©pondre UNIQUEMENT avec un JSON valide. "
|
| 274 |
+
"Ne pas inclure de texte avant ou après le JSON. "
|
| 275 |
+
"Le JSON doit Γͺtre bien formΓ© et respecter le schΓ©ma demandΓ©."
|
| 276 |
+
)
|
| 277 |
+
system_messages = [msg for msg in messages if msg.get("role") == "system"]
|
| 278 |
+
if system_messages:
|
| 279 |
+
last_system = system_messages[-1]
|
| 280 |
+
last_system["content"] = f"{last_system['content']}{json_instruction}"
|
| 281 |
+
else:
|
| 282 |
+
messages = [{"role": "system", "content": json_instruction}] + messages
|
| 283 |
+
log_info("JSON output format enforced via system prompt")
|
| 284 |
+
|
| 285 |
# Generate prompt using chat template
|
| 286 |
if hasattr(tokenizer, "apply_chat_template"):
|
| 287 |
prompt = tokenizer.apply_chat_template(
|
|
|
|
| 302 |
|
| 303 |
# Handle streaming vs non-streaming
|
| 304 |
if stream:
|
| 305 |
+
return self._chat_stream(inputs, temperature, top_p, max_tokens, payload.get("model", MODEL_NAME), tools, json_output_required)
|
| 306 |
|
| 307 |
+
return self._generate_response(inputs, temperature, top_p, max_tokens, payload.get("model", MODEL_NAME), tools, json_output_required)
|
| 308 |
|
| 309 |
except Exception as e:
|
| 310 |
log_error(f"Error in chat completion: {str(e)}", exc_info=True)
|
| 311 |
raise
|
| 312 |
|
| 313 |
def _generate_response(
|
| 314 |
+
self, inputs, temperature: float, top_p: float, max_tokens: int, model_id: str, tools: Optional[List[Dict[str, Any]]] = None, json_output_required: bool = False
|
| 315 |
) -> Dict[str, Any]:
|
| 316 |
"""Generate non-streaming response."""
|
| 317 |
try:
|
|
|
|
| 337 |
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
|
| 338 |
completion_tokens = len(generated_ids)
|
| 339 |
|
| 340 |
+
# β
If JSON output is required, try to extract JSON from the response
|
| 341 |
+
if json_output_required:
|
| 342 |
+
generated_text = self._extract_json_from_text(generated_text)
|
| 343 |
+
|
| 344 |
# β
Parse tool calls from generated text
|
| 345 |
tool_calls = None
|
| 346 |
if tools:
|
|
|
|
| 400 |
gc.collect()
|
| 401 |
|
| 402 |
async def _chat_stream(
|
| 403 |
+
self, inputs, temperature: float, top_p: float, max_tokens: int, model_id: str, tools: Optional[List[Dict[str, Any]]] = None, json_output_required: bool = False
|
| 404 |
) -> AsyncIterator[str]:
|
| 405 |
"""Stream chat completions."""
|
| 406 |
completion_id = f"chatcmpl-{os.urandom(12).hex()}"
|
|
|
|
| 586 |
# Clean up extra whitespace
|
| 587 |
text = re.sub(r'\n\s*\n', '\n\n', text)
|
| 588 |
return text.strip()
|
| 589 |
+
|
| 590 |
+
def _extract_json_from_text(self, text: str) -> str:
|
| 591 |
+
"""Extract JSON from text, handling cases where JSON is wrapped in markdown or other text."""
|
| 592 |
+
# Try to find JSON object in the text
|
| 593 |
+
# First, try to find JSON wrapped in ```json ... ``` or ``` ... ```
|
| 594 |
+
json_code_block = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
|
| 595 |
+
if json_code_block:
|
| 596 |
+
return json_code_block.group(1).strip()
|
| 597 |
+
|
| 598 |
+
# Try to find JSON object directly (starts with { and ends with })
|
| 599 |
+
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
| 600 |
+
if json_match:
|
| 601 |
+
json_str = json_match.group(0)
|
| 602 |
+
# Validate it's valid JSON
|
| 603 |
+
try:
|
| 604 |
+
json.loads(json_str)
|
| 605 |
+
return json_str
|
| 606 |
+
except json.JSONDecodeError:
|
| 607 |
+
pass
|
| 608 |
+
|
| 609 |
+
# If no JSON found, return original text (will be validated by caller)
|
| 610 |
+
return text.strip()
|
| 611 |
|
| 612 |
|
| 613 |
# Module-level provider instance
|
app/routers/openai_api.py
CHANGED
|
@@ -84,7 +84,17 @@ async def chat_completions(body: ChatCompletionRequest):
|
|
| 84 |
if body.tools:
|
| 85 |
payload["tools"] = [t.model_dump() for t in body.tools]
|
| 86 |
if body.tool_choice:
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
# Validate temperature range
|
| 90 |
if payload["temperature"] < 0 or payload["temperature"] > 2:
|
|
|
|
| 84 |
if body.tools:
|
| 85 |
payload["tools"] = [t.model_dump() for t in body.tools]
|
| 86 |
if body.tool_choice:
|
| 87 |
+
# Handle tool_choice: if it's a dict, pass as-is; if it's a string, pass as-is
|
| 88 |
+
if isinstance(body.tool_choice, dict):
|
| 89 |
+
payload["tool_choice"] = body.tool_choice
|
| 90 |
+
else:
|
| 91 |
+
payload["tool_choice"] = body.tool_choice
|
| 92 |
+
# β
Add response_format if provided (for structured outputs)
|
| 93 |
+
if body.response_format:
|
| 94 |
+
if isinstance(body.response_format, dict):
|
| 95 |
+
payload["response_format"] = body.response_format
|
| 96 |
+
else:
|
| 97 |
+
payload["response_format"] = body.response_format.model_dump()
|
| 98 |
|
| 99 |
# Validate temperature range
|
| 100 |
if payload["temperature"] < 0 or payload["temperature"] > 2:
|
docs/openai_api_verification.md
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenAI API Compatibility Verification
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
This document verifies that our OpenAI API wrapper implementation correctly follows the OpenAI API specification and properly connects to the Qwen fine-tuned model.
|
| 5 |
+
|
| 6 |
+
## Connection Flow
|
| 7 |
+
|
| 8 |
+
```
|
| 9 |
+
PydanticAI Agent
|
| 10 |
+
β (OpenAI-compatible requests)
|
| 11 |
+
Hugging Face Space API (simple-llm-pro-finance)
|
| 12 |
+
β (FastAPI router)
|
| 13 |
+
TransformersProvider
|
| 14 |
+
β (Hugging Face Transformers)
|
| 15 |
+
Qwen-Open-Finance-R-8B Model
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## OpenAI API Specification Compliance
|
| 19 |
+
|
| 20 |
+
### 1. Chat Completions Endpoint: `/v1/chat/completions`
|
| 21 |
+
|
| 22 |
+
#### β
Request Parameters (All Supported)
|
| 23 |
+
|
| 24 |
+
| Parameter | Type | Status | Notes |
|
| 25 |
+
|-----------|------|--------|-------|
|
| 26 |
+
| `model` | string | β
| Required, defaults to configured model |
|
| 27 |
+
| `messages` | array | β
| Required, validated |
|
| 28 |
+
| `temperature` | number | β
| Optional, default 0.7, validated (0-2) |
|
| 29 |
+
| `max_tokens` | integer | β
| Optional, validated (β₯1) |
|
| 30 |
+
| `stream` | boolean | β
| Optional, default false |
|
| 31 |
+
| `top_p` | number | β
| Optional, default 1.0 |
|
| 32 |
+
| `tools` | array | β
| Optional, tool definitions |
|
| 33 |
+
| `tool_choice` | string/object | β
| Optional, supports "none", "auto", "required" |
|
| 34 |
+
| `response_format` | object | β
| Optional, supports {"type": "json_object"} |
|
| 35 |
+
|
| 36 |
+
#### β
Response Format
|
| 37 |
+
|
| 38 |
+
| Field | Type | Status | Notes |
|
| 39 |
+
|-------|------|--------|-------|
|
| 40 |
+
| `id` | string | β
| Generated chat completion ID |
|
| 41 |
+
| `object` | string | β
| "chat.completion" |
|
| 42 |
+
| `created` | integer | β
| Unix timestamp |
|
| 43 |
+
| `model` | string | β
| Model name |
|
| 44 |
+
| `choices` | array | β
| Array of Choice objects |
|
| 45 |
+
| `usage` | object | β
| Token usage statistics |
|
| 46 |
+
|
| 47 |
+
#### β
Choice Object
|
| 48 |
+
|
| 49 |
+
| Field | Type | Status | Notes |
|
| 50 |
+
|-------|------|--------|-------|
|
| 51 |
+
| `index` | integer | β
| Choice index |
|
| 52 |
+
| `message` | object | β
| Message object |
|
| 53 |
+
| `finish_reason` | string | β
| "stop", "length", "tool_calls" |
|
| 54 |
+
|
| 55 |
+
#### β
Message Object
|
| 56 |
+
|
| 57 |
+
| Field | Type | Status | Notes |
|
| 58 |
+
|-------|------|--------|-------|
|
| 59 |
+
| `role` | string | β
| "assistant" |
|
| 60 |
+
| `content` | string/null | β
| Message content |
|
| 61 |
+
| `tool_calls` | array/null | β
| Array of ToolCall objects |
|
| 62 |
+
|
| 63 |
+
#### β
ToolCall Object
|
| 64 |
+
|
| 65 |
+
| Field | Type | Status | Notes |
|
| 66 |
+
|-------|------|--------|-------|
|
| 67 |
+
| `id` | string | β
| Tool call ID |
|
| 68 |
+
| `type` | string | β
| "function" |
|
| 69 |
+
| `function` | object | β
| FunctionCall object |
|
| 70 |
+
|
| 71 |
+
#### β
FunctionCall Object
|
| 72 |
+
|
| 73 |
+
| Field | Type | Status | Notes |
|
| 74 |
+
|-------|------|--------|-------|
|
| 75 |
+
| `name` | string | β
| Function name |
|
| 76 |
+
| `arguments` | string | β
| JSON string of arguments |
|
| 77 |
+
|
| 78 |
+
### 2. Tool Choice Handling
|
| 79 |
+
|
| 80 |
+
#### β
Supported Values
|
| 81 |
+
|
| 82 |
+
- `"none"`: Model will not call any tools
|
| 83 |
+
- `"auto"`: Model can choose to call tools (default)
|
| 84 |
+
- `"required"`: Model must call a tool (converted to "auto" for text-based models)
|
| 85 |
+
- `{"type": "function", "function": {"name": "..."}}`: Force specific tool
|
| 86 |
+
|
| 87 |
+
**Implementation Note**: Since Qwen is a text-based model (not native function calling), we convert `"required"` to `"auto"` and handle tool calls via text parsing.
|
| 88 |
+
|
| 89 |
+
### 3. Response Format Handling
|
| 90 |
+
|
| 91 |
+
#### β
JSON Object Mode
|
| 92 |
+
|
| 93 |
+
When `response_format={"type": "json_object"}` is provided:
|
| 94 |
+
- β
System prompt is enhanced with JSON output instructions
|
| 95 |
+
- β
Response is parsed to extract JSON from markdown code blocks
|
| 96 |
+
- β
Clean JSON is returned for PydanticAI validation
|
| 97 |
+
|
| 98 |
+
**Implementation**: Since Qwen doesn't have native JSON mode, we enforce it via prompt engineering and post-processing.
|
| 99 |
+
|
| 100 |
+
## PydanticAI Integration
|
| 101 |
+
|
| 102 |
+
### β
What PydanticAI Sends
|
| 103 |
+
|
| 104 |
+
When using `output_type` parameter:
|
| 105 |
+
|
| 106 |
+
```python
|
| 107 |
+
# PydanticAI sends:
|
| 108 |
+
{
|
| 109 |
+
"model": "dragon-llm-open-finance",
|
| 110 |
+
"messages": [...],
|
| 111 |
+
"temperature": 0.7,
|
| 112 |
+
"max_tokens": 3000,
|
| 113 |
+
"response_format": {"type": "json_object"}, # β
Now supported
|
| 114 |
+
"tool_choice": "required", # β
Now accepted (converted to "auto")
|
| 115 |
+
"tools": [...] # β
If tools are defined
|
| 116 |
+
}
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
### β
Our Implementation Handles
|
| 120 |
+
|
| 121 |
+
1. β
`tool_choice="required"` β Accepted and converted to `"auto"`
|
| 122 |
+
2. β
`response_format={"type": "json_object"}` β JSON instructions added to prompt
|
| 123 |
+
3. β
`tools` array β Formatted and added to system prompt
|
| 124 |
+
4. β
Tool calls in response β Parsed from text and returned in OpenAI format
|
| 125 |
+
|
| 126 |
+
## Qwen Model Integration
|
| 127 |
+
|
| 128 |
+
### β
Model Connection
|
| 129 |
+
|
| 130 |
+
1. **Model Loading**: β
Uses Hugging Face Transformers
|
| 131 |
+
- Model: `DragonLLM/Qwen-Open-Finance-R-8B`
|
| 132 |
+
- Tokenizer: Auto-loaded with model
|
| 133 |
+
- Device: Auto (CUDA if available)
|
| 134 |
+
|
| 135 |
+
2. **Prompt Formatting**: β
Uses Qwen chat template
|
| 136 |
+
- System prompts properly formatted
|
| 137 |
+
- Tools added to system prompt
|
| 138 |
+
- JSON instructions added when needed
|
| 139 |
+
|
| 140 |
+
3. **Response Processing**: β
|
| 141 |
+
- Text generation via Transformers
|
| 142 |
+
- Tool call parsing from text
|
| 143 |
+
- JSON extraction from markdown
|
| 144 |
+
|
| 145 |
+
### β
Qwen-Specific Considerations
|
| 146 |
+
|
| 147 |
+
1. **Text-Based Tool Calls**: Qwen doesn't have native function calling, so we:
|
| 148 |
+
- Format tools in system prompt
|
| 149 |
+
- Parse `<tool_call>...</tool_call>` blocks from response
|
| 150 |
+
- Convert to OpenAI-compatible format
|
| 151 |
+
|
| 152 |
+
2. **JSON Output**: Qwen doesn't have native JSON mode, so we:
|
| 153 |
+
- Add JSON instructions to system prompt
|
| 154 |
+
- Extract JSON from markdown code blocks
|
| 155 |
+
- Validate and return clean JSON
|
| 156 |
+
|
| 157 |
+
## Verification Checklist
|
| 158 |
+
|
| 159 |
+
### API Compatibility
|
| 160 |
+
- [x] All required OpenAI API parameters supported
|
| 161 |
+
- [x] Response format matches OpenAI specification
|
| 162 |
+
- [x] Error handling follows OpenAI error format
|
| 163 |
+
- [x] Streaming support implemented
|
| 164 |
+
- [x] Tool calls properly formatted
|
| 165 |
+
|
| 166 |
+
### PydanticAI Compatibility
|
| 167 |
+
- [x] `tool_choice="required"` accepted
|
| 168 |
+
- [x] `response_format` supported
|
| 169 |
+
- [x] `output_type` requests handled correctly
|
| 170 |
+
- [x] Tool definitions passed through
|
| 171 |
+
- [x] Structured outputs extracted
|
| 172 |
+
|
| 173 |
+
### Qwen Model Integration
|
| 174 |
+
- [x] Model loads correctly from Hugging Face
|
| 175 |
+
- [x] Chat template applied correctly
|
| 176 |
+
- [x] Tools formatted for Qwen prompt style
|
| 177 |
+
- [x] Tool calls parsed from Qwen text format
|
| 178 |
+
- [x] JSON extracted from Qwen responses
|
| 179 |
+
|
| 180 |
+
## Testing Recommendations
|
| 181 |
+
|
| 182 |
+
1. **Basic Chat**: Verify simple chat completions work
|
| 183 |
+
2. **Tool Calls**: Test with tools defined, verify parsing
|
| 184 |
+
3. **Structured Outputs**: Test with `output_type`, verify JSON extraction
|
| 185 |
+
4. **Error Handling**: Test invalid requests return proper errors
|
| 186 |
+
5. **Streaming**: Test streaming responses work correctly
|
| 187 |
+
|
| 188 |
+
## Known Limitations
|
| 189 |
+
|
| 190 |
+
1. **Native Function Calling**: Qwen doesn't support native function calling, so we use text-based parsing
|
| 191 |
+
2. **JSON Mode**: Qwen doesn't have native JSON mode, so we enforce via prompts
|
| 192 |
+
3. **Tool Choice "required"**: Converted to "auto" since we can't force tool calls in text-based models
|
| 193 |
+
|
| 194 |
+
## Conclusion
|
| 195 |
+
|
| 196 |
+
β
**Our OpenAI API wrapper is correctly implemented and properly connected to the Qwen fine-tuned model.**
|
| 197 |
+
|
| 198 |
+
The implementation:
|
| 199 |
+
- Follows OpenAI API specification
|
| 200 |
+
- Handles PydanticAI-specific parameters correctly
|
| 201 |
+
- Properly integrates with Qwen model via Transformers
|
| 202 |
+
- Provides fallbacks for features not natively supported by Qwen
|
| 203 |
+
|
docs/transformers_verification.md
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Transformers Library Usage Verification
|
| 2 |
+
|
| 3 |
+
## Current Implementation
|
| 4 |
+
|
| 5 |
+
### β
Library Version
|
| 6 |
+
- **Dockerfile**: `transformers>=4.45.0` (updated from 4.40.0)
|
| 7 |
+
- **Minimum Required**: 4.37.0 for Qwen1.5, 4.35.0 for Qwen2.5
|
| 8 |
+
- **Recommended**: 4.45.0+ for latest Qwen features and bug fixes
|
| 9 |
+
|
| 10 |
+
### β
Correct Usage of Transformers API
|
| 11 |
+
|
| 12 |
+
#### 1. Model Loading
|
| 13 |
+
```python
|
| 14 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 15 |
+
|
| 16 |
+
# β
Correct: Using AutoModelForCausalLM for causal language models
|
| 17 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 18 |
+
MODEL_NAME,
|
| 19 |
+
token=hf_token,
|
| 20 |
+
trust_remote_code=True, # β
Required for Qwen models
|
| 21 |
+
dtype=torch.bfloat16, # β
Memory-efficient precision
|
| 22 |
+
device_map="auto", # β
Automatic device placement
|
| 23 |
+
max_memory={0: "20GiB"}, # β
Memory management
|
| 24 |
+
cache_dir=CACHE_DIR,
|
| 25 |
+
low_cpu_mem_usage=True, # β
Efficient loading
|
| 26 |
+
)
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
**Verification**:
|
| 30 |
+
- β
`AutoModelForCausalLM` is correct for Qwen (causal LM architecture)
|
| 31 |
+
- β
`trust_remote_code=True` is required for Qwen's custom code
|
| 32 |
+
- β
`dtype=torch.bfloat16` is optimal for memory and performance
|
| 33 |
+
- β
`device_map="auto"` automatically handles GPU/CPU placement
|
| 34 |
+
- β
`max_memory` limits GPU memory usage
|
| 35 |
+
|
| 36 |
+
#### 2. Tokenizer Loading
|
| 37 |
+
```python
|
| 38 |
+
# β
Correct: Using AutoTokenizer
|
| 39 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 40 |
+
MODEL_NAME,
|
| 41 |
+
token=hf_token,
|
| 42 |
+
trust_remote_code=True, # β
Required for Qwen
|
| 43 |
+
cache_dir=CACHE_DIR,
|
| 44 |
+
)
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
**Verification**:
|
| 48 |
+
- β
`AutoTokenizer` automatically detects Qwen tokenizer
|
| 49 |
+
- β
`trust_remote_code=True` loads Qwen's custom tokenizer code
|
| 50 |
+
- β
Chat template handling is correct
|
| 51 |
+
|
| 52 |
+
#### 3. Chat Template Usage
|
| 53 |
+
```python
|
| 54 |
+
# β
Correct: Using apply_chat_template
|
| 55 |
+
if hasattr(tokenizer, "apply_chat_template"):
|
| 56 |
+
prompt = tokenizer.apply_chat_template(
|
| 57 |
+
messages,
|
| 58 |
+
tokenize=False,
|
| 59 |
+
add_generation_prompt=True,
|
| 60 |
+
)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**Verification**:
|
| 64 |
+
- β
`apply_chat_template` is the modern way (replaces manual formatting)
|
| 65 |
+
- β
`tokenize=False` returns string (we tokenize separately)
|
| 66 |
+
- β
`add_generation_prompt=True` adds assistant prompt
|
| 67 |
+
|
| 68 |
+
#### 4. Model Generation
|
| 69 |
+
```python
|
| 70 |
+
# β
Correct: Using model.generate()
|
| 71 |
+
outputs = model.generate(
|
| 72 |
+
**inputs,
|
| 73 |
+
max_new_tokens=max_tokens,
|
| 74 |
+
temperature=temperature,
|
| 75 |
+
top_p=top_p,
|
| 76 |
+
top_k=DEFAULT_TOP_K,
|
| 77 |
+
do_sample=temperature > 0,
|
| 78 |
+
pad_token_id=PAD_TOKEN_ID,
|
| 79 |
+
eos_token_id=EOS_TOKENS,
|
| 80 |
+
repetition_penalty=REPETITION_PENALTY,
|
| 81 |
+
use_cache=True,
|
| 82 |
+
)
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
**Verification**:
|
| 86 |
+
- β
`max_new_tokens` is correct (not `max_length`)
|
| 87 |
+
- β
`do_sample` based on temperature is correct
|
| 88 |
+
- β
`pad_token_id` and `eos_token_id` properly configured
|
| 89 |
+
- β
`repetition_penalty` helps avoid repetition
|
| 90 |
+
- β
`use_cache=True` improves performance
|
| 91 |
+
|
| 92 |
+
#### 5. Streaming Support
|
| 93 |
+
```python
|
| 94 |
+
# β
Correct: Using TextIteratorStreamer
|
| 95 |
+
from transformers import TextIteratorStreamer
|
| 96 |
+
|
| 97 |
+
streamer = TextIteratorStreamer(
|
| 98 |
+
tokenizer,
|
| 99 |
+
skip_prompt=True,
|
| 100 |
+
skip_special_tokens=True
|
| 101 |
+
)
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Verification**:
|
| 105 |
+
- β
`TextIteratorStreamer` is the correct class for streaming
|
| 106 |
+
- β
`skip_prompt=True` avoids re-printing the prompt
|
| 107 |
+
- β
`skip_special_tokens=True` produces clean output
|
| 108 |
+
|
| 109 |
+
## Qwen-Specific Considerations
|
| 110 |
+
|
| 111 |
+
### β
Model Architecture
|
| 112 |
+
- **Qwen-Open-Finance-R-8B** is based on Qwen architecture
|
| 113 |
+
- Uses **CausalLM** architecture (autoregressive generation)
|
| 114 |
+
- Compatible with `AutoModelForCausalLM`
|
| 115 |
+
|
| 116 |
+
### β
Tokenizer Features
|
| 117 |
+
- Qwen tokenizer supports chat templates
|
| 118 |
+
- Custom chat template can be loaded from model repo
|
| 119 |
+
- Handles special tokens correctly
|
| 120 |
+
|
| 121 |
+
### β
Generation Parameters
|
| 122 |
+
- Qwen works well with:
|
| 123 |
+
- `temperature`: 0.1-1.0 (we use 0.7 default)
|
| 124 |
+
- `top_p`: 0.9-1.0 (we use 1.0 default)
|
| 125 |
+
- `top_k`: 50-100 (we use DEFAULT_TOP_K)
|
| 126 |
+
- `repetition_penalty`: 1.0-1.2 (we use REPETITION_PENALTY)
|
| 127 |
+
|
| 128 |
+
## Best Practices Followed
|
| 129 |
+
|
| 130 |
+
1. β
**Memory Management**: Using `bfloat16`, `low_cpu_mem_usage`, `max_memory`
|
| 131 |
+
2. β
**Device Handling**: `device_map="auto"` for automatic GPU/CPU
|
| 132 |
+
3. β
**Caching**: Using `cache_dir` for model/tokenizer caching
|
| 133 |
+
4. β
**Error Handling**: Proper exception handling in initialization
|
| 134 |
+
5. β
**Thread Safety**: Using locks for concurrent initialization
|
| 135 |
+
6. β
**Streaming**: Proper async streaming implementation
|
| 136 |
+
|
| 137 |
+
## Potential Improvements
|
| 138 |
+
|
| 139 |
+
### 1. Consider Using `torch.compile()` (PyTorch 2.0+)
|
| 140 |
+
```python
|
| 141 |
+
# Optional: Compile model for faster inference
|
| 142 |
+
if hasattr(torch, 'compile'):
|
| 143 |
+
model = torch.compile(model, mode="reduce-overhead")
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
### 2. Consider Flash Attention 2
|
| 147 |
+
```python
|
| 148 |
+
# For faster attention computation (if supported)
|
| 149 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 150 |
+
...,
|
| 151 |
+
attn_implementation="flash_attention_2", # If available
|
| 152 |
+
)
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
### 3. Consider Quantization (if memory constrained)
|
| 156 |
+
```python
|
| 157 |
+
# 8-bit quantization (requires bitsandbytes)
|
| 158 |
+
from transformers import BitsAndBytesConfig
|
| 159 |
+
|
| 160 |
+
quantization_config = BitsAndBytesConfig(
|
| 161 |
+
load_in_8bit=True,
|
| 162 |
+
)
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
## Version Compatibility Matrix
|
| 166 |
+
|
| 167 |
+
| Component | Minimum | Recommended | Current |
|
| 168 |
+
|-----------|---------|-------------|---------|
|
| 169 |
+
| Transformers | 4.37.0 | 4.45.0+ | 4.45.0+ β
|
|
| 170 |
+
| PyTorch | 2.0.0 | 2.5.0+ | 2.5.0+ β
|
|
| 171 |
+
| Python | 3.8 | 3.11+ | 3.11 β
|
|
| 172 |
+
| CUDA | 11.8 | 12.4 | 12.4 β
|
|
| 173 |
+
|
| 174 |
+
## Conclusion
|
| 175 |
+
|
| 176 |
+
β
**Our Transformers implementation is correct and follows best practices.**
|
| 177 |
+
|
| 178 |
+
The code:
|
| 179 |
+
- Uses correct Transformers API methods
|
| 180 |
+
- Properly handles Qwen-specific requirements
|
| 181 |
+
- Implements efficient memory management
|
| 182 |
+
- Supports streaming correctly
|
| 183 |
+
- Uses appropriate generation parameters
|
| 184 |
+
|
| 185 |
+
The version update to 4.45.0+ ensures:
|
| 186 |
+
- Latest bug fixes
|
| 187 |
+
- Better Qwen support
|
| 188 |
+
- Improved performance
|
| 189 |
+
- Security updates
|
| 190 |
+
|