Spaces:

elmerzole
/

llm-api-proxy

Paused

llm-api-proxy / src /rotator_library /providers /qwen_code_provider.py

Mirrowel

refactor(core): 🔨 centralize path management for PyInstaller compatibility

467f294 3 months ago

28.3 kB

	# src/rotator_library/providers/qwen_code_provider.py

	import copy
	import json
	import time
	import os
	import httpx
	import logging
	from typing import Union, AsyncGenerator, List, Dict, Any
	from .provider_interface import ProviderInterface
	from .qwen_auth_base import QwenAuthBase
	from ..model_definitions import ModelDefinitions
	from ..timeout_config import TimeoutConfig
	from ..utils.paths import get_logs_dir
	import litellm
	from litellm.exceptions import RateLimitError, AuthenticationError
	from pathlib import Path
	import uuid
	from datetime import datetime

	lib_logger = logging.getLogger("rotator_library")


	def _get_qwen_code_logs_dir() -> Path:
	    """Return the Qwen Code log directory, creating it when necessary.

	    The directory is the ``qwen_code_logs`` child of whatever
	    ``get_logs_dir()`` returns. It is created together with any missing
	    parents, and an already existing directory is not an error.
	    """
	    target = get_logs_dir().joinpath("qwen_code_logs")
	    target.mkdir(parents=True, exist_ok=True)
	    return target


	class _QwenCodeFileLogger:
	"""A simple file logger for a single Qwen Code transaction."""

	def __init__(self, model_name: str, enabled: bool = True):
	self.enabled = enabled
	if not self.enabled:
	return

	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
	request_id = str(uuid.uuid4())
	# Sanitize model name for directory
	safe_model_name = model_name.replace("/", "_").replace(":", "_")
	self.log_dir = (
	_get_qwen_code_logs_dir() / f"{timestamp}_{safe_model_name}_{request_id}"
	)
	try:
	self.log_dir.mkdir(parents=True, exist_ok=True)
	except Exception as e:
	lib_logger.error(f"Failed to create Qwen Code log directory: {e}")
	self.enabled = False

	def log_request(self, payload: Dict[str, Any]):
	"""Logs the request payload sent to Qwen Code."""
	if not self.enabled:
	return
	try:
	with open(
	self.log_dir / "request_payload.json", "w", encoding="utf-8"
	) as f:
	json.dump(payload, f, indent=2, ensure_ascii=False)
	except Exception as e:
	lib_logger.error(f"_QwenCodeFileLogger: Failed to write request: {e}")

	def log_response_chunk(self, chunk: str):
	"""Logs a raw chunk from the Qwen Code response stream."""
	if not self.enabled:
	return
	try:
	with open(self.log_dir / "response_stream.log", "a", encoding="utf-8") as f:
	f.write(chunk + "\n")
	except Exception as e:
	lib_logger.error(
	f"_QwenCodeFileLogger: Failed to write response chunk: {e}"
	)

	def log_error(self, error_message: str):
	"""Logs an error message."""
	if not self.enabled:
	return
	try:
	with open(self.log_dir / "error.log", "a", encoding="utf-8") as f:
	f.write(f"[{datetime.utcnow().isoformat()}] {error_message}\n")
	except Exception as e:
	lib_logger.error(f"_QwenCodeFileLogger: Failed to write error: {e}")

	def log_final_response(self, response_data: Dict[str, Any]):
	"""Logs the final, reassembled response."""
	if not self.enabled:
	return
	try:
	with open(self.log_dir / "final_response.json", "w", encoding="utf-8") as f:
	json.dump(response_data, f, indent=2, ensure_ascii=False)
	except Exception as e:
	lib_logger.error(
	f"_QwenCodeFileLogger: Failed to write final response: {e}"
	)


	# Fallback model IDs. get_models() offers each of them (prefixed with
	# "qwen_code/") unless an environment-defined model already claims the ID.
	HARDCODED_MODELS = ["qwen3-coder-plus", "qwen3-coder-flash"]

	# OpenAI-compatible parameters supported by Qwen Code API.
	# _build_request_payload() drops every keyword argument whose name is not
	# listed here before the request body is sent.
	SUPPORTED_PARAMS = {
	"model",
	"messages",
	"temperature",
	"top_p",
	"max_tokens",
	"stream",
	"tools",
	"tool_choice",
	"presence_penalty",
	"frequency_penalty",
	"n",
	"stop",
	"seed",
	"response_format",
	}


	# Provider implementation for the Qwen Code chat-completions API.
	# The token helpers used by its methods (initialize_token, get_api_details,
	# _refresh_token) are not defined in this file — presumably inherited from
	# QwenAuthBase (TODO confirm).
	class QwenCodeProvider(QwenAuthBase, ProviderInterface):
	# Class-level flag that nothing in this file reads — presumably consumed by
	# the rotator core to skip cost accounting for this provider (TODO confirm).
	skip_cost_calculation = True
	# Marker string associated with `<think>` reasoning spans in streamed content
	# (see _convert_chunk_to_openai).
	# NOTE(review): `\|` is not a recognised Python escape sequence, so the
	# backslashes are part of the value. This looks like Markdown escaping of
	# "||" — confirm against the upstream file.
	REASONING_START_MARKER = "THINK\|\|"

	def __init__(self):
	    """Run the base-class initialisers and load the model definitions.

	    ``self.model_definitions`` is the ``ModelDefinitions`` instance that
	    ``get_models`` queries for the statically configured model list.
	    """
	    super().__init__()
	    self.model_definitions = ModelDefinitions()

	def has_custom_logic(self) -> bool:
	    """Always return True.

	    Presumably signals to the caller that requests must go through this
	    provider's own ``acompletion`` — confirm against ``ProviderInterface``.
	    """
	    return True

	async def get_models(self, credential: str, client: httpx.AsyncClient) -> List[str]:
	    """
	    Return the merged list of Qwen Code models from three sources.

	    1. Models configured through ``self.model_definitions`` for
	       ``qwen_code`` - ALWAYS included, never deduplicated, so the same ID
	       may appear with different configurations.
	    2. ``HARDCODED_MODELS`` - added only if the ID is not already known.
	    3. Dynamic discovery via ``GET <api_base>/v1/models`` - added only if
	       the ID is not already known.

	    Dynamic discovery is best-effort: any failure (credential setup, HTTP
	    error, unexpected payload) is logged at debug level and the models
	    gathered so far are returned.

	    Args:
	        credential: Credential identifier. When it is a path to an existing
	            file, ``initialize_token`` is awaited on it first.
	        client: HTTP client used for the discovery request.

	    Returns:
	        Model names; hardcoded and discovered entries have the form
	        ``"qwen_code/<id>"``, configured entries are passed through as-is.
	    """
	    models: List[str] = []
	    # IDs already claimed by an earlier source; later sources skip them.
	    known_ids = set()

	    def extract_model_id(item) -> str:
	        """Extract model ID from a dict, or a string with/without provider prefix."""
	        if isinstance(item, dict):
	            # Dict format: extract 'id' or 'name' field
	            return item.get("id") or item.get("name", "")
	        if isinstance(item, str):
	            # String format: extract ID from "provider/id" or just "id"
	            return item.split("/")[-1] if "/" in item else item
	        return str(item)

	    # Source 1: configured models (ALWAYS include ALL of them)
	    static_models = self.model_definitions.get_all_provider_models("qwen_code")
	    if static_models:
	        for model in static_models:
	            # Extract model name from "qwen_code/ModelName" format
	            model_name = model.split("/")[-1] if "/" in model else model
	            # The actual model ID may differ from the configured name
	            model_id = self.model_definitions.get_model_id("qwen_code", model_name)

	            models.append(model)
	            if model_id:
	                known_ids.add(model_id)
	        lib_logger.info(
	            f"Loaded {len(static_models)} static models for qwen_code from environment variables"
	        )

	    # Source 2: hardcoded fallbacks (only if the ID is still unknown)
	    for model_id in HARDCODED_MODELS:
	        if model_id not in known_ids:
	            models.append(f"qwen_code/{model_id}")
	            known_ids.add(model_id)

	    # Source 3: dynamic discovery (only if the ID is still unknown)
	    try:
	        # Validate OAuth credentials and get API details
	        if os.path.isfile(credential):
	            await self.initialize_token(credential)

	        api_base, access_token = await self.get_api_details(credential)
	        models_url = f"{api_base.rstrip('/')}/v1/models"

	        response = await client.get(
	            models_url, headers={"Authorization": f"Bearer {access_token}"}
	        )
	        response.raise_for_status()

	        dynamic_data = response.json()
	        # Handle both {data: [...]} and direct [...] formats. Any other
	        # shape (e.g. a dict without "data") carries no model list;
	        # iterating it would turn its keys into bogus model IDs.
	        if isinstance(dynamic_data, dict):
	            model_list = dynamic_data.get("data")
	        else:
	            model_list = dynamic_data
	        if not isinstance(model_list, list):
	            lib_logger.debug(
	                "Unexpected model list format from qwen_code API; skipping dynamic models"
	            )
	            model_list = []

	        dynamic_count = 0
	        for model in model_list:
	            model_id = extract_model_id(model)
	            if model_id and model_id not in known_ids:
	                models.append(f"qwen_code/{model_id}")
	                known_ids.add(model_id)
	                dynamic_count += 1

	        if dynamic_count > 0:
	            lib_logger.debug(
	                f"Discovered {dynamic_count} additional models for qwen_code from API"
	            )

	    except Exception as e:
	        # Deliberately best-effort: discovery problems must not hide the
	        # configured and hardcoded models.
	        lib_logger.debug(f"Dynamic model discovery failed for qwen_code: {e}")

	    return models

	def _clean_tool_schemas(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	"""
	Removes unsupported properties from tool schemas to prevent API errors.
	Adapted for Qwen's API requirements.
	"""
	cleaned_tools = []

	for tool in tools:
	cleaned_tool = copy.deepcopy(tool)

	if "function" in cleaned_tool:
	func = cleaned_tool["function"]

	# Remove strict mode (not supported by Qwen)
	func.pop("strict", None)

	# Clean parameter schema if present
	if "parameters" in func and isinstance(func["parameters"], dict):
	params = func["parameters"]

	# Remove additionalProperties if present
	params.pop("additionalProperties", None)

	# Recursively clean nested properties
	if "properties" in params:
	self._clean_schema_properties(params["properties"])

	cleaned_tools.append(cleaned_tool)

	return cleaned_tools

	def _clean_schema_properties(self, properties: Dict[str, Any]) -> None:
	"""Recursively cleans schema properties."""
	for prop_name, prop_schema in properties.items():
	if isinstance(prop_schema, dict):
	# Remove unsupported fields
	prop_schema.pop("strict", None)
	prop_schema.pop("additionalProperties", None)

	# Recurse into nested properties
	if "properties" in prop_schema:
	self._clean_schema_properties(prop_schema["properties"])

	# Recurse into array items
	if "items" in prop_schema and isinstance(prop_schema["items"], dict):
	self._clean_schema_properties({"item": prop_schema["items"]})

	def _build_request_payload(self, **kwargs) -> Dict[str, Any]:
	    """
	    Assemble the JSON body for a Qwen Code chat-completions request.

	    Only keyword arguments named in ``SUPPORTED_PARAMS`` are kept, which
	    keeps litellm-internal parameters from causing 400 Bad Request errors.
	    Streaming is always switched on, with usage reporting included in the
	    stream. Supplied tools are cleaned; when none are supplied a dummy
	    tool is injected instead.
	    """
	    payload = {}
	    for key, value in kwargs.items():
	        if key in SUPPORTED_PARAMS:
	            payload[key] = value

	    # Streaming is forced for internal processing, with usage data included
	    payload.update(stream=True, stream_options={"include_usage": True})

	    if payload.get("tools"):
	        payload["tools"] = self._clean_tool_schemas(payload["tools"])
	        lib_logger.debug(f"Cleaned {len(payload['tools'])} tool schemas")
	        return payload

	    # Per Qwen Code API bug (see: https://github.com/qianwen-team/flash-dance/issues/2),
	    # injecting a dummy tool prevents stream corruption when no tools are provided
	    placeholder_tool = {
	        "type": "function",
	        "function": {
	            "name": "do_not_call_me",
	            "description": "Do not call this tool.",
	            "parameters": {"type": "object", "properties": {}},
	        },
	    }
	    payload["tools"] = [placeholder_tool]
	    lib_logger.debug("Injected dummy tool to prevent Qwen API stream corruption")
	    return payload

	def _convert_chunk_to_openai(self, chunk: Dict[str, Any], model_id: str):
	"""
	Converts a raw Qwen SSE chunk to an OpenAI-compatible chunk.

	CRITICAL FIX: Handle chunks with BOTH usage and choices (final chunk)
	without early return to ensure finish_reason is properly processed.
	"""
	if not isinstance(chunk, dict):
	return

	# Get choices and usage data
	choices = chunk.get("choices", [])
	usage_data = chunk.get("usage")
	chunk_id = chunk.get("id", f"chatcmpl-qwen-{time.time()}")
	chunk_created = chunk.get("created", int(time.time()))

	# Handle chunks with BOTH choices and usage (typical for final chunk)
	# CRITICAL: Process choices FIRST to capture finish_reason, then yield usage
	if choices and usage_data:
	choice = choices[0]
	delta = choice.get("delta", {})
	finish_reason = choice.get("finish_reason")

	# Yield the choice chunk first (contains finish_reason)
	yield {
	"choices": [
	{"index": 0, "delta": delta, "finish_reason": finish_reason}
	],
	"model": model_id,
	"object": "chat.completion.chunk",
	"id": chunk_id,
	"created": chunk_created,
	}
	# Then yield the usage chunk
	yield {
	"choices": [],
	"model": model_id,
	"object": "chat.completion.chunk",
	"id": chunk_id,
	"created": chunk_created,
	"usage": {
	"prompt_tokens": usage_data.get("prompt_tokens", 0),
	"completion_tokens": usage_data.get("completion_tokens", 0),
	"total_tokens": usage_data.get("total_tokens", 0),
	},
	}
	return

	# Handle usage-only chunks
	if usage_data:
	yield {
	"choices": [],
	"model": model_id,
	"object": "chat.completion.chunk",
	"id": chunk_id,
	"created": chunk_created,
	"usage": {
	"prompt_tokens": usage_data.get("prompt_tokens", 0),
	"completion_tokens": usage_data.get("completion_tokens", 0),
	"total_tokens": usage_data.get("total_tokens", 0),
	},
	}
	return

	# Handle content-only chunks
	if not choices:
	return

	choice = choices[0]
	delta = choice.get("delta", {})
	finish_reason = choice.get("finish_reason")

	# Handle <think> tags for reasoning content
	content = delta.get("content")
	if content and ("<think>" in content or "</think>" in content):
	parts = (
	content.replace("<think>", f"\|\|{self.REASONING_START_MARKER}")
	.replace("</think>", f"\|\|/{self.REASONING_START_MARKER}")
	.split("\|\|")
	)
	for part in parts:
	if not part:
	continue

	new_delta = {}
	if part.startswith(self.REASONING_START_MARKER):
	new_delta["reasoning_content"] = part.replace(
	self.REASONING_START_MARKER, ""
	)
	elif part.startswith(f"/{self.REASONING_START_MARKER}"):
	continue
	else:
	new_delta["content"] = part

	yield {
	"choices": [
	{"index": 0, "delta": new_delta, "finish_reason": None}
	],
	"model": model_id,
	"object": "chat.completion.chunk",
	"id": chunk_id,
	"created": chunk_created,
	}
	else:
	# Standard content chunk
	yield {
	"choices": [
	{"index": 0, "delta": delta, "finish_reason": finish_reason}
	],
	"model": model_id,
	"object": "chat.completion.chunk",
	"id": chunk_id,
	"created": chunk_created,
	}

	def _stream_to_completion_response(
	    self, chunks: List[litellm.ModelResponse]
	) -> litellm.ModelResponse:
	    """
	    Reassemble streamed chunks into one complete ``chat.completion`` response.

	    Text (``content``, ``reasoning_content``), tool-call fragments and the
	    legacy ``function_call`` are concatenated across all chunks that carry
	    a choice. ``id``, ``created`` and ``model`` come from the first chunk,
	    usage from the last chunk that has any.

	    The resulting ``finish_reason`` is ``"tool_calls"`` whenever tool calls
	    were collected, otherwise the last finish_reason seen in the chunks,
	    otherwise ``"stop"``.

	    Raises:
	        ValueError: If ``chunks`` is empty.
	    """
	    if not chunks:
	        raise ValueError("No chunks provided for reassembly")

	    def has_value(container, key) -> bool:
	        """True when ``key`` is present and its value is not None."""
	        return key in container and container[key] is not None

	    def append_text(target, key, source) -> None:
	        """Concatenate ``source[key]`` onto ``target[key]`` when it has a value."""
	        if has_value(source, key):
	            target[key] = target.get(key, "") + source[key]

	    message = {"role": "assistant"}
	    tool_calls_by_index = {}
	    last_finish_reason = None  # only used when no tool calls were collected

	    for chunk in chunks:
	        if not getattr(chunk, "choices", None):
	            continue

	        choice = chunk.choices[0]
	        delta = choice.get("delta", {})

	        append_text(message, "content", delta)
	        append_text(message, "reasoning_content", delta)

	        # Tool calls arrive as fragments keyed by index
	        if "tool_calls" in delta and delta["tool_calls"]:
	            for fragment in delta["tool_calls"]:
	                slot = tool_calls_by_index.setdefault(
	                    fragment.get("index", 0),
	                    # Pre-filled with type for OpenAI compatibility
	                    {
	                        "type": "function",
	                        "function": {"name": "", "arguments": ""},
	                    },
	                )
	                for meta in ("id", "type"):
	                    if meta in fragment:
	                        slot[meta] = fragment[meta]
	                if "function" in fragment:
	                    append_text(slot["function"], "name", fragment["function"])
	                    append_text(
	                        slot["function"], "arguments", fragment["function"]
	                    )

	        # Legacy function_call format
	        if has_value(delta, "function_call"):
	            legacy_call = message.setdefault(
	                "function_call", {"name": "", "arguments": ""}
	            )
	            append_text(legacy_call, "name", delta["function_call"])
	            append_text(legacy_call, "arguments", delta["function_call"])

	        if choice.get("finish_reason"):
	            last_finish_reason = choice["finish_reason"]

	    # Usage comes from the last chunk that carries any
	    usage_data = next(
	        (
	            candidate.usage
	            for candidate in reversed(chunks)
	            if hasattr(candidate, "usage") and candidate.usage
	        ),
	        None,
	    )

	    if tool_calls_by_index:
	        message["tool_calls"] = list(tool_calls_by_index.values())

	    # Standard fields are always present, for consistent logging
	    for field in ("content", "tool_calls", "function_call"):
	        message.setdefault(field, None)

	    # tool_calls wins if present, then the chunks' finish_reason, then "stop"
	    if tool_calls_by_index:
	        finish_reason = "tool_calls"
	    else:
	        finish_reason = last_finish_reason or "stop"

	    first_chunk = chunks[0]
	    return litellm.ModelResponse(
	        id=first_chunk.id,
	        object="chat.completion",
	        created=first_chunk.created,
	        model=first_chunk.model,
	        choices=[
	            {
	                "index": 0,
	                "message": message,
	                "finish_reason": finish_reason,
	            }
	        ],
	        usage=usage_data,
	    )

	async def acompletion(
	    self, client: httpx.AsyncClient, **kwargs
	) -> Union[litellm.ModelResponse, AsyncGenerator[litellm.ModelResponse, None]]:
	    """
	    Run a chat completion against the Qwen Code API.

	    The upstream request is always streamed. With ``stream`` truthy in
	    ``kwargs`` an async generator of converted chunks is returned; otherwise
	    the stream is consumed here and the reassembled response is returned.

	    Args:
	        client: HTTP client used for the request.
	        **kwargs: Completion parameters. ``credential_identifier``
	            (required) and ``enable_request_logging`` (optional, default
	            False) are consumed here; ``model`` is required; the rest is
	            filtered by ``_build_request_payload``.

	    Raises:
	        RateLimitError: On HTTP 429, or an error body containing
	            ``slow_down``.
	        httpx.HTTPStatusError: On any other HTTP status >= 400 (a 401 is
	            first retried once after a forced token refresh).
	        ValueError: In non-streaming mode when the stream produced no chunks.
	    """
	    credential_path = kwargs.pop("credential_identifier")
	    enable_request_logging = kwargs.pop("enable_request_logging", False)
	    model = kwargs["model"]

	    # Create dedicated file logger for this request
	    file_logger = _QwenCodeFileLogger(
	        model_name=model, enabled=enable_request_logging
	    )

	    async def make_request():
	        """Prepares the API call and returns the (not yet entered) stream context."""
	        api_base, access_token = await self.get_api_details(credential_path)

	        # Strip provider prefix from model name (e.g., "qwen_code/qwen3-coder-plus" -> "qwen3-coder-plus")
	        model_name = model.split("/")[-1]
	        kwargs_with_stripped_model = {**kwargs, "model": model_name}

	        # Build clean payload with only supported parameters
	        payload = self._build_request_payload(**kwargs_with_stripped_model)

	        headers = {
	            "Authorization": f"Bearer {access_token}",
	            "Content-Type": "application/json",
	            "Accept": "text/event-stream",
	            "User-Agent": "google-api-nodejs-client/9.15.1",
	            "X-Goog-Api-Client": "gl-node/22.17.0",
	            "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
	        }

	        url = f"{api_base.rstrip('/')}/v1/chat/completions"

	        # Log request to dedicated file
	        file_logger.log_request(payload)
	        lib_logger.debug(f"Qwen Code Request URL: {url}")

	        return client.stream(
	            "POST",
	            url,
	            headers=headers,
	            json=payload,
	            timeout=TimeoutConfig.streaming(),
	        )

	    async def stream_handler(response_stream, attempt=1):
	        """Handles the streaming response and converts chunks."""
	        try:
	            async with response_stream as response:
	                # Check for HTTP errors before processing stream
	                if response.status_code >= 400:
	                    error_text = await response.aread()
	                    error_text = (
	                        error_text.decode("utf-8")
	                        if isinstance(error_text, bytes)
	                        else error_text
	                    )

	                    # Handle 401: Force token refresh and retry once
	                    if response.status_code == 401 and attempt == 1:
	                        lib_logger.warning(
	                            "Qwen Code returned 401. Forcing token refresh and retrying once."
	                        )
	                        await self._refresh_token(credential_path, force=True)
	                        retry_stream = await make_request()
	                        async for chunk in stream_handler(retry_stream, attempt=2):
	                            yield chunk
	                        return

	                    # Handle 429: Rate limit
	                    elif (
	                        response.status_code == 429
	                        or "slow_down" in error_text.lower()
	                    ):
	                        raise RateLimitError(
	                            f"Qwen Code rate limit exceeded: {error_text}",
	                            llm_provider="qwen_code",
	                            model=model,
	                            response=response,
	                        )

	                    # Handle other errors
	                    else:
	                        error_msg = f"Qwen Code HTTP {response.status_code} error: {error_text}"
	                        file_logger.log_error(error_msg)
	                        raise httpx.HTTPStatusError(
	                            f"HTTP {response.status_code}: {error_text}",
	                            request=response.request,
	                            response=response,
	                        )

	                # Process successful streaming response
	                async for line in response.aiter_lines():
	                    file_logger.log_response_chunk(line)
	                    if not line.startswith("data: "):
	                        continue
	                    data_str = line[6:]
	                    if data_str == "[DONE]":
	                        break
	                    # Only the decode is guarded, so the handler cannot catch
	                    # an unrelated error raised while yielding.
	                    try:
	                        chunk = json.loads(data_str)
	                    except json.JSONDecodeError:
	                        lib_logger.warning(
	                            f"Could not decode JSON from Qwen Code: {line}"
	                        )
	                        continue
	                    for openai_chunk in self._convert_chunk_to_openai(
	                        chunk, model
	                    ):
	                        yield litellm.ModelResponse(**openai_chunk)

	        except httpx.HTTPStatusError:
	            raise  # Re-raise HTTP errors we already handled
	        except Exception as e:
	            file_logger.log_error(f"Error during Qwen Code stream processing: {e}")
	            lib_logger.error(
	                f"Error during Qwen Code stream processing: {e}", exc_info=True
	            )
	            raise

	    async def logging_stream_wrapper():
	        """Wraps the stream to log the final reassembled response."""
	        openai_chunks = []
	        try:
	            async for chunk in stream_handler(await make_request()):
	                openai_chunks.append(chunk)
	                yield chunk
	        finally:
	            # Reassembly here only feeds the file logger: skip it when
	            # logging is off, and never let a logging failure replace the
	            # exception (or normal completion) of the stream itself.
	            if openai_chunks and file_logger.enabled:
	                try:
	                    final_response = self._stream_to_completion_response(
	                        openai_chunks
	                    )
	                    file_logger.log_final_response(final_response.dict())
	                except Exception as e:
	                    lib_logger.error(
	                        f"Failed to log final Qwen Code response: {e}"
	                    )

	    if kwargs.get("stream"):
	        return logging_stream_wrapper()

	    # Non-streaming callers get the stream drained and reassembled here.
	    chunks = [chunk async for chunk in logging_stream_wrapper()]
	    return self._stream_to_completion_response(chunks)