Buckets:

leideng
/

QCFuse

Files

xet

leideng/QCFuse / srt /entrypoints /openai /protocol.py

leideng

15 days ago

download

raw

39.6 kB

	# Copyright 2023-2024 SGLang Team
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	"""Pydantic models for OpenAI API protocol"""

	import logging
	import time
	import uuid
	from dataclasses import dataclass
	from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeAlias, Union

	from openai.types.responses import (
	ResponseFunctionToolCall,
	ResponseInputItemParam,
	ResponseOutputItem,
	ResponseOutputMessage,
	ResponseOutputText,
	ResponseReasoningItem,
	)
	from openai.types.responses.response import ToolChoice
	from openai.types.responses.tool import Tool
	from pydantic import (
	BaseModel,
	Field,
	field_validator,
	model_serializer,
	model_validator,
	)
	from typing_extensions import Literal
	from xgrammar import StructuralTag

	from sglang.utils import convert_json_schema_to_str

	logger = logging.getLogger(__name__)

	DEFAULT_MODEL_NAME = "default"


	class ModelCard(BaseModel):
	"""Model cards."""

	id: str
	object: str = "model"
	created: int = Field(default_factory=lambda: int(time.time()))
	owned_by: str = "sglang"
	root: Optional[str] = None
	max_model_len: Optional[int] = None


	class ModelList(BaseModel):
	"""Model list consists of model cards."""

	object: str = "list"
	data: List[ModelCard] = Field(default_factory=list)


	class ErrorResponse(BaseModel):
	object: str = "error"
	message: str
	type: str
	param: Optional[str] = None
	code: int


	class LogProbs(BaseModel):
	text_offset: List[int] = Field(default_factory=list)
	token_logprobs: List[Optional[float]] = Field(default_factory=list)
	tokens: List[str] = Field(default_factory=list)
	top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)


	class TopLogprob(BaseModel):
	token: str
	bytes: List[int]
	logprob: float


	class ChatCompletionTokenLogprob(BaseModel):
	token: str
	bytes: List[int]
	logprob: float
	top_logprobs: List[TopLogprob]


	class ChoiceLogprobs(BaseModel):
	# build for v1/chat/completions response
	content: List[ChatCompletionTokenLogprob]


	class UsageInfo(BaseModel):
	prompt_tokens: int = 0
	total_tokens: int = 0
	completion_tokens: Optional[int] = 0
	# only used to return cached tokens when --enable-cache-report is set
	prompt_tokens_details: Optional[Dict[str, int]] = None
	reasoning_tokens: Optional[int] = 0


	class StreamOptions(BaseModel):
	include_usage: Optional[bool] = False


	class JsonSchemaResponseFormat(BaseModel):
	name: str
	description: Optional[str] = None
	# use alias to workaround pydantic conflict
	schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
	strict: Optional[bool] = False


	class ResponseFormat(BaseModel):
	type: Literal["text", "json_object", "json_schema"]
	json_schema: Optional[JsonSchemaResponseFormat] = None


	class StructuresResponseFormat(BaseModel):
	begin: str
	schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
	end: str


	# NOTE(dark): keep this for backward compatibility
	class LegacyStructuralTagResponseFormat(BaseModel):
	type: Literal["structural_tag"]
	structures: List[StructuresResponseFormat]
	triggers: List[str]


	StructuralTagResponseFormat: TypeAlias = Union[
	LegacyStructuralTagResponseFormat, StructuralTag
	]

	ToolCallConstraint: TypeAlias = Union[
	Tuple[Literal["structural_tag"], StructuralTagResponseFormat],
	Tuple[Literal["json_schema"], Any], # json_schema can be dict/str/None
	]


	class FileRequest(BaseModel):
	# https://platform.openai.com/docs/api-reference/files/create
	file: bytes # The File object (not file name) to be uploaded
	purpose: str = (
	"batch" # The intended purpose of the uploaded file, default is "batch"
	)


	class FileResponse(BaseModel):
	id: str
	object: str = "file"
	bytes: int
	created_at: int
	filename: str
	purpose: str


	class FileDeleteResponse(BaseModel):
	id: str
	object: str = "file"
	deleted: bool


	class BatchRequest(BaseModel):
	input_file_id: (
	str # The ID of an uploaded file that contains requests for the new batch
	)
	endpoint: str # The endpoint to be used for all requests in the batch
	completion_window: str # The time frame within which the batch should be processed
	metadata: Optional[dict] = None # Optional custom metadata for the batch


	class BatchResponse(BaseModel):
	id: str
	object: str = "batch"
	endpoint: str
	errors: Optional[dict] = None
	input_file_id: str
	completion_window: str
	status: str = "validating"
	output_file_id: Optional[str] = None
	error_file_id: Optional[str] = None
	created_at: int
	in_progress_at: Optional[int] = None
	expires_at: Optional[int] = None
	finalizing_at: Optional[int] = None
	completed_at: Optional[int] = None
	failed_at: Optional[int] = None
	expired_at: Optional[int] = None
	cancelling_at: Optional[int] = None
	cancelled_at: Optional[int] = None
	request_counts: Optional[dict] = None
	metadata: Optional[dict] = None


	class CompletionRequest(BaseModel):
	# Ordered by official OpenAI API documentation
	# https://platform.openai.com/docs/api-reference/completions/create
	model: str = Field(
	default=DEFAULT_MODEL_NAME,
	description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
	)
	prompt: Union[List[int], List[List[int]], str, List[str]]
	best_of: Optional[int] = None
	echo: bool = False
	frequency_penalty: float = 0.0
	logit_bias: Optional[Dict[str, float]] = None
	logprobs: Optional[int] = None
	max_tokens: int = 16
	n: int = 1
	presence_penalty: float = 0.0
	seed: Optional[int] = None
	stop: Optional[Union[str, List[str]]] = None
	stream: bool = False
	stream_options: Optional[StreamOptions] = None
	suffix: Optional[str] = None
	temperature: float = 1.0
	top_p: float = 1.0
	user: Optional[str] = None
	return_hidden_states: bool = False

	# Extra parameters for SRT backend only and will be ignored by OpenAI models.
	top_k: int = -1
	min_p: float = 0.0
	min_tokens: int = 0
	json_schema: Optional[str] = None
	regex: Optional[str] = None
	ebnf: Optional[str] = None
	repetition_penalty: float = 1.0
	stop_token_ids: Optional[List[int]] = None
	stop_regex: Optional[Union[str, List[str]]] = None
	no_stop_trim: bool = False
	ignore_eos: bool = False
	skip_special_tokens: bool = True
	lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
	session_params: Optional[Dict] = None
	response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
	custom_params: Optional[Dict] = None
	custom_logit_processor: Optional[str] = None

	# For PD disaggregation
	bootstrap_host: Optional[Union[List[str], str]] = None
	bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
	bootstrap_room: Optional[Union[List[int], int]] = None

	# For request id
	rid: Optional[Union[List[str], str]] = None
	# Extra key for classifying the request (e.g. cache_salt)
	extra_key: Optional[Union[List[str], str]] = None
	# Cache salt for request caching
	cache_salt: Optional[Union[List[str], str]] = None
	# Priority for the request
	priority: Optional[int] = None

	# For custom metric labels
	custom_labels: Optional[Dict[str, str]] = None

	@field_validator("max_tokens")
	@classmethod
	def validate_max_tokens_positive(cls, v):
	if v is not None and v <= 0:
	raise ValueError("max_tokens must be positive")
	return v


	class CompletionResponseChoice(BaseModel):
	index: int
	text: str
	logprobs: Optional[LogProbs] = None
	finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
	matched_stop: Union[None, int, str] = None
	hidden_states: Optional[object] = None

	@model_serializer(mode="wrap")
	def _serialize(self, handler):
	data = handler(self)
	if self.hidden_states is None:
	data.pop("hidden_states", None)
	return data


	class CompletionResponse(BaseModel):
	id: str
	object: str = "text_completion"
	created: int = Field(default_factory=lambda: int(time.time()))
	model: str
	choices: List[CompletionResponseChoice]
	usage: UsageInfo
	metadata: Optional[Dict[str, Any]] = None


	class CompletionResponseStreamChoice(BaseModel):
	index: int
	text: str
	logprobs: Optional[LogProbs] = None
	finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
	matched_stop: Union[None, int, str] = None
	hidden_states: Optional[object] = None

	@model_serializer(mode="wrap")
	def _serialize(self, handler):
	data = handler(self)
	if self.hidden_states is None:
	data.pop("hidden_states", None)
	return data


	class CompletionStreamResponse(BaseModel):
	id: str
	object: str = "text_completion"
	created: int = Field(default_factory=lambda: int(time.time()))
	model: str
	choices: List[CompletionResponseStreamChoice]
	usage: Optional[UsageInfo] = None


	class ChatCompletionMessageContentTextPart(BaseModel):
	type: Literal["text"]
	text: str


	class ChatCompletionMessageContentImageURL(BaseModel):
	url: str
	detail: Optional[Literal["auto", "low", "high"]] = "auto"


	class ChatCompletionMessageContentVideoURL(BaseModel):
	url: str


	class ChatCompletionMessageContentAudioURL(BaseModel):
	url: str


	class ChatCompletionMessageContentImagePart(BaseModel):
	type: Literal["image_url"]
	image_url: ChatCompletionMessageContentImageURL
	modalities: Optional[Literal["image", "multi-images", "video"]] = "image"


	class ChatCompletionMessageContentVideoPart(BaseModel):
	type: Literal["video_url"]
	video_url: ChatCompletionMessageContentVideoURL


	class ChatCompletionMessageContentAudioPart(BaseModel):
	type: Literal["audio_url"]
	audio_url: ChatCompletionMessageContentAudioURL


	ChatCompletionMessageContentPart = Union[
	ChatCompletionMessageContentTextPart,
	ChatCompletionMessageContentImagePart,
	ChatCompletionMessageContentVideoPart,
	ChatCompletionMessageContentAudioPart,
	]


	class FunctionResponse(BaseModel):
	"""Function response."""

	name: Optional[str] = None
	arguments: Optional[str \| Dict[str, Any]] = None


	class ToolCall(BaseModel):
	"""Tool call response."""

	id: Optional[str] = None
	index: Optional[int] = None
	type: Literal["function"] = "function"
	function: FunctionResponse


	class ChatCompletionMessageGenericParam(BaseModel):
	role: Literal["system", "assistant", "tool", "function"]
	content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
	default=None
	)
	tool_call_id: Optional[str] = None
	name: Optional[str] = None
	reasoning_content: Optional[str] = None
	tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])

	@field_validator("role", mode="before")
	@classmethod
	def _normalize_role(cls, v):
	if isinstance(v, str):
	v_lower = v.lower()
	if v_lower not in {"system", "assistant", "tool", "function"}:
	raise ValueError(
	"'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
	)
	return v_lower
	raise ValueError("'role' must be a string")


	class ChatCompletionMessageUserParam(BaseModel):
	role: Literal["user"]
	content: Union[str, List[ChatCompletionMessageContentPart]]


	ChatCompletionMessageParam = Union[
	ChatCompletionMessageGenericParam, ChatCompletionMessageUserParam
	]


	class Function(BaseModel):
	"""Function descriptions."""

	description: Optional[str] = Field(default=None, examples=[None])
	name: str
	parameters: Optional[object] = None
	strict: bool = False


	class Tool(BaseModel):
	"""Function wrapper."""

	type: str = Field(default="function", examples=["function"])
	function: Function


	class ToolChoiceFuncName(BaseModel):
	"""The name of tool choice function."""

	name: Optional[str] = None


	class ToolChoice(BaseModel):
	"""The tool choice definition."""

	function: ToolChoiceFuncName
	type: Literal["function"] = Field(default="function", examples=["function"])


	class ChatCompletionRequest(BaseModel):
	# Ordered by official OpenAI API documentation
	# https://platform.openai.com/docs/api-reference/chat/create
	messages: List[ChatCompletionMessageParam]
	model: str = Field(
	default=DEFAULT_MODEL_NAME,
	description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
	)
	frequency_penalty: float = 0.0
	logit_bias: Optional[Dict[str, float]] = None
	logprobs: bool = False
	top_logprobs: Optional[int] = None
	max_tokens: Optional[int] = Field(
	default=None,
	deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
	description="The maximum number of tokens that can be generated in the chat completion. ",
	)
	max_completion_tokens: Optional[int] = Field(
	default=None,
	description="The maximum number of completion tokens for a chat completion request, "
	"including visible output tokens and reasoning tokens. Input tokens are not included. ",
	)
	n: int = 1
	presence_penalty: float = 0.0
	response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
	seed: Optional[int] = None
	stop: Optional[Union[str, List[str]]] = None
	stream: bool = False
	stream_options: Optional[StreamOptions] = None
	temperature: Optional[float] = None
	top_p: Optional[float] = None
	user: Optional[str] = None
	tools: Optional[List[Tool]] = Field(default=None, examples=[None])
	tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
	default="auto", examples=["none"]
	) # noqa
	return_hidden_states: bool = False
	reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
	default="medium",
	description="Constrains effort on reasoning for reasoning models. "
	"'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
	"result in faster responses and fewer tokens used on reasoning in a response. "
	"Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
	)

	# Extra parameters for SRT backend only and will be ignored by OpenAI models.
	top_k: Optional[int] = None
	min_p: Optional[float] = None
	min_tokens: int = 0
	regex: Optional[str] = None
	ebnf: Optional[str] = None
	repetition_penalty: Optional[float] = None
	stop_token_ids: Optional[List[int]] = None
	stop_regex: Optional[Union[str, List[str]]] = None
	no_stop_trim: bool = False
	ignore_eos: bool = False
	continue_final_message: bool = False
	skip_special_tokens: bool = True
	lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
	session_params: Optional[Dict] = None
	separate_reasoning: bool = True
	stream_reasoning: bool = True
	chat_template_kwargs: Optional[Dict] = None

	# Custom logit processor for advanced sampling control
	custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
	custom_params: Optional[Dict] = None

	# For request id
	rid: Optional[Union[List[str], str]] = None
	# Extra key for classifying the request (e.g. cache_salt)
	extra_key: Optional[Union[List[str], str]] = None
	# Cache salt for request caching
	cache_salt: Optional[Union[List[str], str]] = None
	# Priority for the request
	priority: Optional[int] = None

	# For PD disaggregation
	bootstrap_host: Optional[Union[List[str], str]] = None
	bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
	bootstrap_room: Optional[Union[List[int], int]] = None

	# OpenAI/SGLang default sampling parameters
	_DEFAULT_SAMPLING_PARAMS = {
	"temperature": 1.0,
	"top_p": 1.0,
	"top_k": -1,
	"min_p": 0.0,
	"repetition_penalty": 1.0,
	}

	@model_validator(mode="before")
	@classmethod
	def set_tool_choice_default(cls, values):
	if values.get("tool_choice") is None:
	if values.get("tools") is None:
	values["tool_choice"] = "none"
	else:
	values["tool_choice"] = "auto"
	return values

	@model_validator(mode="before")
	@classmethod
	def normalize_reasoning_inputs(cls, values: Dict):
	r = values.get("reasoning")
	if r is None:
	return values

	if isinstance(r, dict):
	effort = r.get("effort") or r.get("reasoning_effort")
	if effort in {"low", "medium", "high"}:
	values["reasoning_effort"] = effort

	enabled = (
	r.get("enabled")
	if r.get("enabled") is not None
	else r.get("enable", False)
	)
	if isinstance(enabled, str):
	enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
	if enabled:
	ctk = values.get("chat_template_kwargs")
	if not isinstance(ctk, dict):
	ctk = {}
	ctk.setdefault("thinking", True)
	values["chat_template_kwargs"] = ctk

	return values

	@model_validator(mode="before")
	@classmethod
	def set_json_schema(cls, values):
	response_format = values.get("response_format")
	if not response_format:
	return values

	if response_format.get("type") != "json_schema":
	return values

	schema = response_format.pop("schema", None)
	json_schema = response_format.get("json_schema")

	if json_schema:
	return values

	if schema:
	name_ = schema.get("title", "Schema")
	strict_ = False
	if "properties" in schema and "strict" in schema["properties"]:
	item = schema["properties"].pop("strict", None)
	if item and item.get("default", False):
	strict_ = True

	response_format["json_schema"] = {
	"name": name_,
	"schema": schema,
	"strict": strict_,
	}

	return values

	def to_sampling_params(
	self,
	stop: List[str],
	model_generation_config: Dict[str, Any],
	tool_call_constraint: Optional[ToolCallConstraint] = None,
	) -> Dict[str, Any]:
	"""
	Convert request to sampling parameters.
	Priority: user value > model generation_config > OpenAI defaults
	"""

	def get_param(param_name: str):
	value = getattr(self, param_name)
	if value is None:
	return model_generation_config.get(
	param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
	)
	return value

	sampling_params = {
	"temperature": get_param("temperature"),
	"max_new_tokens": self.max_tokens or self.max_completion_tokens,
	"min_new_tokens": self.min_tokens,
	"stop": stop,
	"stop_token_ids": self.stop_token_ids,
	"stop_regex": self.stop_regex,
	"top_p": get_param("top_p"),
	"top_k": get_param("top_k"),
	"min_p": get_param("min_p"),
	"presence_penalty": self.presence_penalty,
	"frequency_penalty": self.frequency_penalty,
	"repetition_penalty": get_param("repetition_penalty"),
	"regex": self.regex,
	"ebnf": self.ebnf,
	"n": self.n,
	"no_stop_trim": self.no_stop_trim,
	"ignore_eos": self.ignore_eos,
	"skip_special_tokens": self.skip_special_tokens,
	"logit_bias": self.logit_bias,
	"custom_params": self.custom_params,
	}

	if self.response_format and self.response_format.type == "json_schema":
	sampling_params["json_schema"] = convert_json_schema_to_str(
	self.response_format.json_schema.schema_
	)
	elif self.response_format and self.response_format.type == "json_object":
	sampling_params["json_schema"] = '{"type": "object"}'
	elif self.response_format and self.response_format.type == "structural_tag":
	sampling_params["structural_tag"] = convert_json_schema_to_str(
	self.response_format.model_dump(by_alias=True)
	)

	# Check if there are already existing output constraints
	has_existing_constraints = (
	sampling_params.get("regex")
	or sampling_params.get("ebnf")
	or sampling_params.get("structural_tag")
	or sampling_params.get("json_schema")
	)

	if tool_call_constraint and has_existing_constraints:
	logger.warning("Constrained decoding is not compatible with tool calls.")
	elif tool_call_constraint:
	constraint_type, constraint_value = tool_call_constraint
	if constraint_type == "structural_tag":
	sampling_params[constraint_type] = convert_json_schema_to_str(
	constraint_value.model_dump(by_alias=True)
	)
	elif constraint_type == "json_schema":
	sampling_params[constraint_type] = convert_json_schema_to_str(
	constraint_value # type: ignore
	)
	else:
	sampling_params[constraint_type] = constraint_value

	return sampling_params


	class ChatMessage(BaseModel):
	role: Optional[str] = None
	content: Optional[str] = None
	reasoning_content: Optional[str] = None
	tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


	class ChatCompletionResponseChoice(BaseModel):
	index: int
	message: ChatMessage
	logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
	finish_reason: Optional[
	Literal[
	"stop", "length", "tool_calls", "content_filter", "function_call", "abort"
	]
	] = None
	matched_stop: Union[None, int, str] = None
	hidden_states: Optional[object] = None

	@model_serializer(mode="wrap")
	def _serialize(self, handler):
	data = handler(self)
	if self.hidden_states is None:
	data.pop("hidden_states", None)
	return data


	class ChatCompletionResponse(BaseModel):
	id: str
	object: str = "chat.completion"
	created: int = Field(default_factory=lambda: int(time.time()))
	model: str
	choices: List[ChatCompletionResponseChoice]
	usage: UsageInfo
	metadata: Optional[Dict[str, Any]] = None


	class DeltaMessage(BaseModel):
	role: Optional[str] = None
	content: Optional[str] = None
	reasoning_content: Optional[str] = None
	tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
	hidden_states: Optional[object] = None

	@model_serializer(mode="wrap")
	def _serialize(self, handler):
	data = handler(self)
	if self.hidden_states is None:
	data.pop("hidden_states", None)
	return data


	class ChatCompletionResponseStreamChoice(BaseModel):
	index: int
	delta: DeltaMessage
	logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
	finish_reason: Optional[
	Literal[
	"stop", "length", "tool_calls", "content_filter", "function_call", "abort"
	]
	] = None
	matched_stop: Union[None, int, str] = None


	class ChatCompletionStreamResponse(BaseModel):
	id: str
	object: str = "chat.completion.chunk"
	created: int = Field(default_factory=lambda: int(time.time()))
	model: str
	choices: List[ChatCompletionResponseStreamChoice]
	usage: Optional[UsageInfo] = None


	class MultimodalEmbeddingInput(BaseModel):
	text: Optional[str] = None
	image: Optional[str] = None


	EmbeddingInput = Union[
	List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
	]


	class EmbeddingRequest(BaseModel):
	# Ordered by official OpenAI API documentation
	# https://platform.openai.com/docs/api-reference/embeddings/create
	input: EmbeddingInput
	model: str = DEFAULT_MODEL_NAME
	encoding_format: str = "float"
	dimensions: Optional[int] = None
	user: Optional[str] = None

	# The request id.
	rid: Optional[Union[List[str], str]] = None
	# Priority for the request
	priority: Optional[int] = None


	class EmbeddingObject(BaseModel):
	embedding: List[float]
	index: int
	object: str = "embedding"


	ClassifyInput = Union[str, List[str], List[int]]


	class ClassifyRequest(BaseModel):
	# OpenAI-compatible classification request
	model: str = DEFAULT_MODEL_NAME
	input: ClassifyInput
	user: Optional[str] = None

	# The request id.
	rid: Optional[Union[List[str], str]] = None
	# Priority for the request
	priority: Optional[int] = None


	class ClassifyData(BaseModel):
	index: int
	label: str
	probs: List[float]
	num_classes: int


	class ClassifyResponse(BaseModel):
	id: str
	object: str = "list"
	created: int
	model: str
	data: List[ClassifyData]
	usage: UsageInfo


	class EmbeddingResponse(BaseModel):
	data: List[EmbeddingObject]
	model: str
	object: str = "list"
	usage: Optional[UsageInfo] = None


	class ScoringRequest(BaseModel):
	query: Optional[Union[str, List[int]]] = (
	None # Query text or pre-tokenized token IDs
	)
	items: Optional[Union[str, List[str], List[List[int]]]] = (
	None # Item text(s) or pre-tokenized token IDs
	)
	label_token_ids: Optional[List[int]] = (
	None # Token IDs to compute probabilities for
	)
	apply_softmax: bool = False
	item_first: bool = False
	model: str = DEFAULT_MODEL_NAME


	class ScoringResponse(BaseModel):
	scores: List[
	List[float]
	] # List of lists of probabilities, each in the order of label_token_ids
	model: str
	usage: Optional[UsageInfo] = None
	object: str = "scoring"


	class V1RerankReqInput(BaseModel):
	query: str
	documents: List[str]


	class RerankResponse(BaseModel):
	score: float
	document: str
	index: int
	meta_info: Optional[dict] = None


	class TokenizeRequest(BaseModel):
	"""Request schema for the /tokenize endpoint."""

	model: str = DEFAULT_MODEL_NAME
	prompt: Union[str, List[str]]
	add_special_tokens: bool = Field(
	default=True,
	description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
	)


	class TokenizeResponse(BaseModel):
	"""Response schema for the /tokenize endpoint."""

	tokens: Union[List[int], List[List[int]]]
	count: Union[int, List[int]]
	max_model_len: int


	class DetokenizeRequest(BaseModel):
	"""Request schema for the /detokenize endpoint."""

	model: str = DEFAULT_MODEL_NAME
	tokens: Union[List[int], List[List[int]]]
	skip_special_tokens: bool = Field(
	default=True,
	description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
	)


	class DetokenizeResponse(BaseModel):
	"""Response schema for the /detokenize endpoint."""

	text: Union[str, List[str]]


	OpenAIServingRequest = Union[
	ChatCompletionRequest,
	CompletionRequest,
	EmbeddingRequest,
	ClassifyRequest,
	ScoringRequest,
	V1RerankReqInput,
	TokenizeRequest,
	DetokenizeRequest,
	]


	# Response API protocol definitions
	class ResponseReasoningParam(BaseModel):
	"""Reasoning parameters for responses."""

	effort: Optional[Literal["low", "medium", "high"]] = Field(
	default="medium",
	description="Constrains effort on reasoning for reasoning models.",
	)


	class ResponseTool(BaseModel):
	"""Tool definition for responses."""

	type: Literal["web_search_preview", "code_interpreter"] = Field(
	description="Type of tool to enable"
	)


	ResponseInputOutputItem: TypeAlias = Union[
	ResponseInputItemParam,
	"ResponseReasoningItem",
	ResponseFunctionToolCall,
	]


	class ResponsesRequest(BaseModel):
	"""Request body for v1/responses endpoint."""

	# Core OpenAI API fields (ordered by official documentation)
	background: Optional[bool] = False
	include: Optional[
	List[
	Literal[
	"code_interpreter_call.outputs",
	"computer_call_output.output.image_url",
	"file_search_call.results",
	"message.input_image.image_url",
	"message.output_text.logprobs",
	"reasoning.encrypted_content",
	]
	]
	] = None
	input: Union[str, List[ResponseInputOutputItem]]
	instructions: Optional[str] = None
	max_output_tokens: Optional[int] = None
	max_tool_calls: Optional[int] = None
	metadata: Optional[Dict[str, Any]] = None
	model: Optional[str] = None # Made optional to match vLLM
	parallel_tool_calls: Optional[bool] = True
	previous_response_id: Optional[str] = None
	reasoning: Optional[ResponseReasoningParam] = None
	service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
	store: Optional[bool] = True
	stream: Optional[bool] = False
	temperature: Optional[float] = None
	tool_choice: Literal["auto", "required", "none"] = "auto"
	tools: List[ResponseTool] = Field(default_factory=list)
	top_logprobs: Optional[int] = 0
	top_p: Optional[float] = None
	truncation: Optional[Literal["auto", "disabled"]] = "disabled"
	user: Optional[str] = None

	# Extra SGLang parameters
	request_id: str = Field(
	default_factory=lambda: f"resp_{uuid.uuid4().hex}",
	description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
	)
	priority: int = Field(default=0, description="Request priority")
	extra_key: Optional[str] = Field(
	default=None,
	description="Extra key for classifying the request (e.g. cache_salt)",
	)
	cache_salt: Optional[str] = Field(
	default=None, description="Cache salt for request caching"
	)

	# SGLang-specific sampling parameters
	frequency_penalty: float = 0.0
	presence_penalty: float = 0.0
	stop: Optional[Union[str, List[str]]] = None
	top_k: int = -1
	min_p: float = 0.0
	repetition_penalty: float = 1.0

	# Default sampling parameters
	_DEFAULT_SAMPLING_PARAMS = {
	"temperature": 0.7,
	"top_p": 1.0,
	"top_k": -1,
	"min_p": 0.0,
	"repetition_penalty": 1.0,
	}

	def to_sampling_params(
	self, default_max_tokens: int, default_params: Optional[Dict] = None
	) -> Dict[str, Any]:
	"""Convert to sampling parameters for generation."""
	if default_params is None:
	default_params = {}

	# Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
	if self.max_output_tokens is not None:
	max_tokens = min(self.max_output_tokens, default_max_tokens)
	else:
	max_tokens = default_max_tokens

	# Avoid exceed the context length by minus 2 token
	max_tokens -= 2

	# Get parameters with defaults
	temperature = self.temperature
	if temperature is None:
	temperature = default_params.get(
	"temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
	)

	top_p = self.top_p
	if top_p is None:
	top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])

	params = {
	"max_new_tokens": max_tokens,
	"temperature": temperature,
	"top_p": top_p,
	"frequency_penalty": self.frequency_penalty,
	"presence_penalty": self.presence_penalty,
	"stop": self.stop,
	"top_k": self.top_k,
	"min_p": self.min_p,
	"repetition_penalty": self.repetition_penalty,
	}

	# Apply any additional default parameters
	for key, value in default_params.items():
	if key not in params or params[key] is None:
	params[key] = value

	return params


	class PromptTokenUsageInfo(BaseModel):
	"""Prompt token usage details."""

	cached_tokens: int = 0


	class ResponsesResponse(BaseModel):
	"""Response body for v1/responses endpoint."""

	id: str = Field(default_factory=lambda: f"resp_{time.time()}")
	object: Literal["response"] = "response"
	created_at: int = Field(default_factory=lambda: int(time.time()))
	model: str

	output: List[
	Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
	] = Field(default_factory=list)
	status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
	usage: Optional[UsageInfo] = None
	parallel_tool_calls: bool = True
	tool_choice: str = "auto"
	tools: List[ResponseTool] = Field(default_factory=list)

	# OpenAI compatibility fields. not all are used at the moment.
	# Recommend checking https://platform.openai.com/docs/api-reference/responses
	error: Optional[dict] = None
	incomplete_details: Optional[dict] = None # TODO(v) support this input
	instructions: Optional[str] = None
	max_output_tokens: Optional[int] = None
	previous_response_id: Optional[str] = None
	reasoning: Optional[dict] = (
	# Unused. No model supports this. For GPT-oss, system prompt sets
	# the field, not server args.
	None # {"effort": Optional[str], "summary": Optional[str]}
	)
	store: Optional[bool] = None
	temperature: Optional[float] = None
	text: Optional[dict] = None # e.g. {"format": {"type": "text"}}
	top_p: Optional[float] = None
	truncation: Optional[str] = None
	user: Optional[str] = None
	metadata: Optional[Dict[str, Any]] = None

	@classmethod
	def from_request(
	cls,
	request: ResponsesRequest,
	sampling_params: Any,
	model_name: str,
	created_time: int,
	output: List[
	Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
	],
	status: str,
	usage: Optional[UsageInfo],
	) -> "ResponsesResponse":
	"""Create a response from a request."""

	# Determine if the output is plain text only to set text.format
	def _is_text_only(
	items: List[
	Union[
	ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
	]
	],
	) -> bool:
	if not items:
	return False
	for it in items:
	# tool call -> not pure text.
	if isinstance(it, ResponseReasoningItem) or isinstance(
	it, ResponseFunctionToolCall
	):
	return False
	try:
	if isinstance(it, ResponseOutputText):
	continue
	elif isinstance(it, ResponseOutputMessage):
	if not it.content:
	continue
	for c in it.content:
	if not isinstance(c, ResponseOutputText):
	return False
	else:
	# Unknown type, not considered text-only
	return False
	except AttributeError:
	return False
	return True

	text_format = {"format": {"type": "text"}} if _is_text_only(output) else None

	return cls(
	id=request.request_id,
	created_at=created_time,
	model=model_name,
	output=output,
	status=status,
	usage=usage,
	parallel_tool_calls=request.parallel_tool_calls or True,
	tool_choice=request.tool_choice,
	tools=request.tools,
	# fields for parity with v1/responses
	error=None,
	incomplete_details=None,
	instructions=request.instructions,
	max_output_tokens=request.max_output_tokens,
	previous_response_id=request.previous_response_id, # TODO(v): ensure this is propagated if retrieved from store
	reasoning={
	"effort": request.reasoning.effort if request.reasoning else None,
	"summary": None, # unused
	},
	store=request.store,
	temperature=request.temperature,
	text=text_format, # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
	top_p=request.top_p,
	truncation=request.truncation,
	user=request.user,
	metadata=request.metadata or {},
	)


	class RequestResponseMetadata(BaseModel):
	"""Metadata for request/response tracking."""

	request_id: str
	final_usage_info: Optional[UsageInfo] = None


	@dataclass
	class MessageProcessingResult:
	"""Result of processing chat messages and applying templates.

	This dataclass encapsulates all the outputs from message processing including
	prompt generation, multimodal data extraction, and constraint preparation.
	Used internally by OpenAIServingChat to pass processed data between methods.

	Args:
	prompt: The final text prompt after applying chat template
	prompt_ids: Either the text prompt (str) or tokenized IDs (List[int])
	image_data: Extracted image data from messages, if any
	audio_data: Extracted audio data from messages, if any
	modalities: List of modality types present in the messages
	stop: Combined stop strings from template and request
	tool_call_constraint: Optional constraint for structured tool calls
	"""

	prompt: str
	prompt_ids: Union[str, List[int]]
	image_data: Optional[Any]
	audio_data: Optional[Any]
	video_data: Optional[Any]
	modalities: List[str]
	stop: List[str]
	tool_call_constraint: Optional[ToolCallConstraint] = None


	class ToolCallProcessingResult(NamedTuple):
	"""Result of processing tool calls in a response."""

	tool_calls: Optional[
	List[Any]
	] # List of ToolCall objects or None if parsing failed
	remaining_text: str # Text remaining after parsing tool calls
	finish_reason: Dict[str, Any] # Updated finish reason dictionary


	class ResponseReasoningTextContent(BaseModel):
	text: str
	type: Literal["reasoning_text"] = "reasoning_text"


	ResponseInputOutputItem: TypeAlias = Union[
	ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
	]

Xet Storage Details

Size:: 39.6 kB
Xet hash:: 652c0c1dbc529c09c6c357d5b87566a08054485b073cbf35d470c0266d723c1f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.