|
|
""" |
|
|
Executor bindings |
|
|
""" |
|
|
from __future__ import annotations |
|
|
import datetime |
|
|
import os |
|
|
import torch |
|
|
import typing |
|
|
__all__ = ['BatchingType', 'CapacitySchedulerPolicy', 'CommunicationMode', 'CommunicationType', 'ContextChunkingPolicy', 'DecodingConfig', 'DecodingMode', 'Executor', 'ExecutorConfig', 'ExternalDraftTokensConfig', 'InflightBatchingStats', 'IterationStats', 'KvCacheConfig', 'KvCacheStats', 'LookaheadDecodingConfig', 'LoraConfig', 'ModelType', 'OrchestratorConfig', 'OutputConfig', 'ParallelConfig', 'PeftCacheConfig', 'PromptTuningConfig', 'Request', 'RequestStage', 'RequestStats', 'RequestStatsPerIteration', 'Response', 'Result', 'SamplingConfig', 'SchedulerConfig', 'StaticBatchingStats'] |
|
|
class BatchingType:
    """
    Request batching strategy (enum-style native binding stub).

    Members:

      STATIC

      INFLIGHT
    """

    # Enum members are exposed as class-level singleton instances.
    INFLIGHT: typing.ClassVar[BatchingType]
    STATIC: typing.ClassVar[BatchingType]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, BatchingType]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    # Allows use in integer contexts (e.g. indexing).
    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class CapacitySchedulerPolicy:
    """
    Policy used by the capacity scheduler (enum-style native binding stub).

    Members:

      MAX_UTILIZATION

      GUARANTEED_NO_EVICT
    """

    # Enum members are exposed as class-level singleton instances.
    GUARANTEED_NO_EVICT: typing.ClassVar[CapacitySchedulerPolicy]
    MAX_UTILIZATION: typing.ClassVar[CapacitySchedulerPolicy]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, CapacitySchedulerPolicy]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class CommunicationMode:
    """
    Communication mode between executor processes (enum-style native binding stub).

    Members:

      LEADER

      ORCHESTRATOR
    """

    # Enum members are exposed as class-level singleton instances.
    LEADER: typing.ClassVar[CommunicationMode]
    ORCHESTRATOR: typing.ClassVar[CommunicationMode]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, CommunicationMode]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class CommunicationType:
    """
    Transport used for inter-process communication (enum-style native binding stub).

    Members:

      MPI
    """

    # Single enum member exposed as a class-level singleton instance.
    MPI: typing.ClassVar[CommunicationType]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, CommunicationType]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class ContextChunkingPolicy:
    """
    Policy for chunking context phases (enum-style native binding stub).

    Members:

      EQUAL_PROGRESS

      FIRST_COME_FIRST_SERVED
    """

    # Enum members are exposed as class-level singleton instances.
    EQUAL_PROGRESS: typing.ClassVar[ContextChunkingPolicy]
    FIRST_COME_FIRST_SERVED: typing.ClassVar[ContextChunkingPolicy]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, ContextChunkingPolicy]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class DecodingConfig:
    """Configuration bundle selecting the decoding mode and mode-specific options.

    All fields are optional; see :class:`DecodingMode`,
    :class:`LookaheadDecodingConfig` for the member types.
    """

    def __init__(self, decoding_mode: DecodingMode | None = None, lookahead_decoding_config: LookaheadDecodingConfig | None = None, medusa_choices: list[list[int]] | None = None) -> None:
        ...

    @property
    def decoding_mode(self) -> DecodingMode | None:
        """Selected decoding mode, or None if unset."""
        ...

    @decoding_mode.setter
    def decoding_mode(self, arg1: DecodingMode) -> None:
        ...

    @property
    def lookahead_decoding_config(self) -> LookaheadDecodingConfig | None:
        """Options for lookahead decoding, or None if unset."""
        ...

    @lookahead_decoding_config.setter
    def lookahead_decoding_config(self, arg1: LookaheadDecodingConfig) -> None:
        ...

    @property
    def medusa_choices(self) -> list[list[int]] | None:
        """Medusa choice tree as nested token-id lists, or None if unset."""
        ...

    @medusa_choices.setter
    def medusa_choices(self, arg1: list[list[int]]) -> None:
        ...
|
|
class DecodingMode:
    """Opaque decoding-mode value.

    Instances are created through the static factory methods (Auto, BeamSearch,
    Lookahead, Medusa, TopK, TopKTopP, TopP) and queried with the is* predicates.
    """

    @staticmethod
    def Auto() -> DecodingMode:
        """Mode that lets the runtime choose the decoding algorithm."""
        ...

    @staticmethod
    def BeamSearch() -> DecodingMode:
        ...

    @staticmethod
    def Lookahead() -> DecodingMode:
        ...

    @staticmethod
    def Medusa() -> DecodingMode:
        ...

    @staticmethod
    def TopK() -> DecodingMode:
        ...

    @staticmethod
    def TopKTopP() -> DecodingMode:
        ...

    @staticmethod
    def TopP() -> DecodingMode:
        ...

    def isAuto(self) -> bool:
        ...

    def isBeamSearch(self) -> bool:
        ...

    def isLookahead(self) -> bool:
        ...

    def isMedusa(self) -> bool:
        ...

    def isTopK(self) -> bool:
        ...

    # True when both top-k and top-p sampling are active.
    def isTopKandTopP(self) -> bool:
        ...

    # True when at least one of top-k / top-p sampling is active.
    def isTopKorTopP(self) -> bool:
        ...

    def isTopP(self) -> bool:
        ...
|
|
class Executor:
    """Executor that runs generation requests against a compiled model engine.

    Usable as a context manager; ``__exit__`` presumably shuts the executor
    down (stub — confirm against the native implementation).
    """

    def __enter__(self) -> typing.Any:
        ...

    def __exit__(self, arg0: typing.Any, arg1: typing.Any, arg2: typing.Any) -> None:
        ...

    @typing.overload
    def __init__(self, model_path: os.PathLike, model_type: ModelType, executor_config: ExecutorConfig) -> None:
        """Create an executor from a single engine directory/file on disk."""
        ...

    @typing.overload
    def __init__(self, encoder_model_path: os.PathLike, decoder_model_path: os.PathLike, model_type: ModelType, executor_config: ExecutorConfig) -> None:
        """Create an executor from separate encoder and decoder engine paths."""
        ...

    @typing.overload
    def __init__(self, engine_buffer: str, json_config_str: str, model_type: ModelType, executor_config: ExecutorConfig) -> None:
        """Create an executor from an in-memory engine buffer plus JSON config."""
        ...

    @typing.overload
    def __init__(self, encoder_engine_buffer: str, encoder_json_config_str: str, decoder_engine_buffer: str, decoder_json_config_str: str, model_type: ModelType, executor_config: ExecutorConfig) -> None:
        """Create an executor from in-memory encoder/decoder buffers and configs."""
        ...

    @typing.overload
    def await_responses(self, timeout: datetime.timedelta | None = None) -> list[Response]:
        """Wait for responses from any request; None timeout waits indefinitely."""
        ...

    @typing.overload
    def await_responses(self, id: int, timeout: datetime.timedelta | None = None) -> list[Response]:
        """Wait for responses belonging to a single request id."""
        ...

    @typing.overload
    def await_responses(self, ids: list[int], timeout: datetime.timedelta | None = None) -> list[list[Response]]:
        """Wait for responses for several request ids; one list per id."""
        ...

    def can_enqueue_requests(self) -> bool:
        ...

    # FIX: the default was annotated `id: int = None` (implicit Optional,
    # invalid under PEP 484). `None` here matches the optional-id style of
    # get_num_responses_ready below.
    def cancel_request(self, id: int | None = None) -> None:
        """Cancel the request with the given id (semantics of id=None are
        defined by the native binding — confirm before relying on it)."""
        ...

    def enqueue_request(self, request: Request) -> int:
        """Submit one request; returns the id used by await_responses/cancel_request."""
        ...

    def enqueue_requests(self, requests: list[Request]) -> list[int]:
        """Submit several requests; returns their ids in order."""
        ...

    def get_latest_iteration_stats(self) -> list[IterationStats]:
        ...

    def get_latest_request_stats(self) -> list[RequestStatsPerIteration]:
        ...

    def get_num_responses_ready(self, id: int | None = None) -> int:
        """Number of responses ready, for one request id or (None) for all."""
        ...

    def shutdown(self) -> None:
        ...
|
|
class ExecutorConfig:
    """Top-level configuration for constructing an :class:`Executor`.

    Plain attributes below are directly readable/writable; optional fields
    are additionally exposed as properties further down.
    """

    # Static vs. in-flight batching; see BatchingType.
    batching_type: BatchingType
    # Enables chunked context processing.
    enable_chunked_context: bool
    # Fraction of weights kept on GPU (default 1.0 in __init__).
    gpu_weights_percent: float
    # How many iterations of iteration statistics are retained.
    iter_stats_max_iterations: int
    kv_cache_config: KvCacheConfig
    max_beam_width: int
    # Optional cap on queued requests; None means no explicit bound set here.
    max_queue_size: int | None
    multi_block_mode: bool
    normalize_log_probs: bool
    # How many iterations of per-request statistics are retained (0 default).
    request_stats_max_iterations: int
    scheduler_config: SchedulerConfig

    # Pickle support.
    def __getstate__(self) -> tuple:
        ...

    def __init__(self, max_beam_width: int = 1, scheduler_config: SchedulerConfig = ..., kv_cache_config: KvCacheConfig = ..., enable_chunked_context: bool = False, normalize_log_probs: bool = True, iter_stats_max_iterations: int = 1000, request_stats_max_iterations: int = 0, batching_type: BatchingType = ..., max_batch_size: int | None = None, max_num_tokens: int | None = None, parallel_config: ParallelConfig | None = None, peft_cache_config: PeftCacheConfig = ..., logits_post_processor_map: dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int, int | None], None]] | None = None, logits_post_processor_batched: typing.Callable[[list[int], list[torch.Tensor], list[list[list[int]]], int, list[int | None]], None] | None = None, decoding_config: DecodingConfig | None = None, gpu_weights_percent: float = 1.0, max_queue_size: int | None = None, multi_block_mode: bool = False) -> None:
        ...

    def __setstate__(self, arg0: tuple) -> None:
        ...

    @property
    def decoding_config(self) -> DecodingConfig | None:
        """Decoding configuration, or None if unset."""
        ...

    @decoding_config.setter
    def decoding_config(self, arg1: DecodingConfig) -> None:
        ...

    @property
    def logits_post_processor_batched(self) -> typing.Callable[[list[int], list[torch.Tensor], list[list[list[int]]], int, list[int | None]], None] | None:
        """Batched logits post-processor callback, or None if unset."""
        ...

    @logits_post_processor_batched.setter
    def logits_post_processor_batched(self, arg1: typing.Callable[[list[int], list[torch.Tensor], list[list[list[int]]], int, list[int | None]], None]) -> None:
        ...

    @property
    def logits_post_processor_map(self) -> dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int, int | None], None]] | None:
        """Per-name logits post-processor callbacks, or None if unset."""
        ...

    @logits_post_processor_map.setter
    def logits_post_processor_map(self, arg1: dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int, int | None], None]]) -> None:
        ...

    @property
    def max_batch_size(self) -> int | None:
        ...

    @max_batch_size.setter
    def max_batch_size(self, arg1: int) -> None:
        ...

    @property
    def max_num_tokens(self) -> int | None:
        ...

    @max_num_tokens.setter
    def max_num_tokens(self, arg1: int) -> None:
        ...

    @property
    def parallel_config(self) -> ParallelConfig | None:
        ...

    @parallel_config.setter
    def parallel_config(self, arg1: ParallelConfig) -> None:
        ...

    @property
    def peft_cache_config(self) -> PeftCacheConfig | None:
        ...

    @peft_cache_config.setter
    def peft_cache_config(self, arg1: PeftCacheConfig) -> None:
        ...
|
|
class ExternalDraftTokensConfig:
    """Externally supplied draft tokens for speculative decoding.

    All fields are read-only after construction (only getters are exposed).
    """

    def __init__(self, tokens: list[int], logits: torch.Tensor | None = None, acceptance_threshold: float | None = None) -> None:
        ...

    @property
    def acceptance_threshold(self) -> float | None:
        """Acceptance threshold for draft tokens, or None if unset."""
        ...

    @property
    def logits(self) -> torch.Tensor | None:
        """Draft logits tensor, or None if not provided."""
        ...

    @property
    def tokens(self) -> list[int]:
        """Draft token ids."""
        ...
|
|
class InflightBatchingStats:
    """Per-iteration statistics when in-flight batching is active.

    A plain data holder; all fields are mutable attributes.
    """

    avg_num_decoded_tokens_per_iter: float
    micro_batch_id: int
    # Requests currently in the context (prefill) phase.
    num_context_requests: int
    # Context tokens processed this iteration.
    num_ctx_tokens: int
    # Requests currently in the generation phase.
    num_gen_requests: int
    num_paused_requests: int
    num_scheduled_requests: int

    def __init__(self) -> None:
        ...
|
|
class IterationStats:
    """Statistics snapshot for one executor iteration.

    Exactly one of inflight_batching_stats / static_batching_stats is
    presumably populated depending on the batching type — confirm against
    the native implementation.
    """

    # Memory usage in bytes (assumed — units not visible in this stub).
    cpu_mem_usage: int
    gpu_mem_usage: int
    inflight_batching_stats: InflightBatchingStats | None
    # Iteration index.
    iter: int
    iter_latency_ms: float
    kv_cache_stats: KvCacheStats | None
    max_num_active_requests: int
    num_active_requests: int
    num_queued_requests: int
    pinned_mem_usage: int
    static_batching_stats: StaticBatchingStats | None
    timestamp: str

    def __init__(self) -> None:
        ...

    def to_json_str(self) -> str:
        """Serialize this snapshot to a JSON string."""
        ...
|
|
class KvCacheConfig:
    """Configuration of the KV cache used by the executor."""

    # Reuse cached blocks across requests (off by default).
    enable_block_reuse: bool
    onboard_blocks: bool

    # Pickle support.
    def __getstate__(self) -> tuple:
        ...

    def __init__(self, enable_block_reuse: bool = False, max_tokens: int | None = None, max_attention_window: int | None = None, sink_token_length: int | None = None, free_gpu_memory_fraction: float | None = None, host_cache_size: int | None = None, onboard_blocks: bool = True) -> None:
        ...

    def __setstate__(self, arg0: tuple) -> None:
        ...

    @property
    def free_gpu_memory_fraction(self) -> float | None:
        """Fraction of free GPU memory to dedicate to the cache, or None."""
        ...

    @free_gpu_memory_fraction.setter
    def free_gpu_memory_fraction(self, arg1: float) -> None:
        ...

    @property
    def host_cache_size(self) -> int | None:
        """Host (CPU) cache size, or None if unset."""
        ...

    @host_cache_size.setter
    def host_cache_size(self, arg1: int) -> None:
        ...

    @property
    def max_attention_window(self) -> int | None:
        ...

    @max_attention_window.setter
    def max_attention_window(self, arg1: int) -> None:
        ...

    @property
    def max_tokens(self) -> int | None:
        """Maximum tokens held in the cache, or None if unset."""
        ...

    @max_tokens.setter
    def max_tokens(self, arg1: int) -> None:
        ...

    @property
    def sink_token_length(self) -> int | None:
        ...

    @sink_token_length.setter
    def sink_token_length(self, arg1: int) -> None:
        ...
|
|
class KvCacheStats:
    """KV-cache utilization counters, all in units of cache blocks
    (except tokens_per_block). Plain data holder.
    """

    # Blocks newly allocated this iteration.
    alloc_new_blocks: int
    alloc_total_blocks: int
    free_num_blocks: int
    max_num_blocks: int
    # Blocks served from reuse rather than fresh allocation.
    reused_blocks: int
    tokens_per_block: int
    used_num_blocks: int

    def __init__(self) -> None:
        ...
|
|
class LookaheadDecodingConfig:
    """Parameters for lookahead decoding. Immutable after construction
    (only getters are exposed).
    """

    def __init__(self, max_window_size: int, max_ngram_size: int, max_verification_set_size: int) -> None:
        ...

    @property
    def max_ngram_size(self) -> int:
        ...

    @property
    def max_verification_set_size(self) -> int:
        ...

    @property
    def max_window_size(self) -> int:
        ...
|
|
class LoraConfig:
    """LoRA adapter selection for a request: a task id plus optional weight
    and config tensors. Immutable after construction (only getters).
    """

    def __init__(self, task_id: int, weights: torch.Tensor | None = None, config: torch.Tensor | None = None) -> None:
        ...

    @property
    def config(self) -> torch.Tensor | None:
        """LoRA config tensor, or None if not provided."""
        ...

    @property
    def task_id(self) -> int:
        """Identifier of the LoRA task/adapter."""
        ...

    @property
    def weights(self) -> torch.Tensor | None:
        """LoRA weights tensor, or None if not provided."""
        ...
|
|
class ModelType:
    """
    Architecture of the model served by the executor (enum-style native
    binding stub).

    Members:

      DECODER_ONLY

      ENCODER_ONLY

      ENCODER_DECODER
    """

    # Enum members are exposed as class-level singleton instances.
    DECODER_ONLY: typing.ClassVar[ModelType]
    ENCODER_DECODER: typing.ClassVar[ModelType]
    ENCODER_ONLY: typing.ClassVar[ModelType]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, ModelType]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class OrchestratorConfig:
    """Settings for ORCHESTRATOR communication mode (see CommunicationMode)."""

    # True when this process acts as the orchestrator.
    is_orchestrator: bool
    # Path to the worker executable spawned by the orchestrator.
    worker_executable_path: str

    def __init__(self, is_orchestrator: bool = True, worker_executable_path: str = '') -> None:
        ...
|
|
class OutputConfig:
    """Flags controlling which extra outputs a request returns.
    All default to False.
    """

    # Omit the prompt tokens from the returned output_token_ids.
    exclude_input_from_output: bool
    return_context_logits: bool
    # Only meaningful for encoder models (see Result.encoder_output).
    return_encoder_output: bool
    return_generation_logits: bool
    return_log_probs: bool

    def __init__(self, return_log_probs: bool = False, return_context_logits: bool = False, return_generation_logits: bool = False, exclude_input_from_output: bool = False, return_encoder_output: bool = False) -> None:
        ...
|
|
class ParallelConfig:
    """Multi-process / multi-device execution settings."""

    # LEADER or ORCHESTRATOR; see CommunicationMode.
    communication_mode: CommunicationMode
    # Transport; see CommunicationType (currently MPI only).
    communication_type: CommunicationType

    # Pickle support.
    def __getstate__(self) -> tuple:
        ...

    def __init__(self, communication_type: CommunicationType = ..., communication_mode: CommunicationMode = ..., device_ids: list[int] | None = None, participant_ids: list[int] | None = None, orchestrator_config: OrchestratorConfig | None = None) -> None:
        ...

    def __setstate__(self, arg0: tuple) -> None:
        ...

    @property
    def device_ids(self) -> list[int] | None:
        """GPU device ids to use, or None if unset."""
        ...

    @device_ids.setter
    def device_ids(self, arg1: list[int]) -> None:
        ...

    @property
    def orchestrator_config(self) -> OrchestratorConfig | None:
        """Orchestrator settings; relevant for ORCHESTRATOR mode."""
        ...

    @orchestrator_config.setter
    def orchestrator_config(self, arg1: OrchestratorConfig) -> None:
        ...

    @property
    def participant_ids(self) -> list[int] | None:
        """Participant (rank) ids, or None if unset."""
        ...

    @participant_ids.setter
    def participant_ids(self, arg1: list[int]) -> None:
        ...
|
|
class PeftCacheConfig:
    """Configuration of the PEFT (e.g. LoRA) weight cache.
    Immutable after construction (only getters are exposed).
    """

    # Pickle support.
    def __getstate__(self) -> tuple:
        ...

    def __init__(self, num_host_module_layer: int = 0, num_device_module_layer: int = 0, optimal_adapter_size: int = 8, max_adapter_size: int = 64, num_put_workers: int = 1, num_ensure_workers: int = 1, num_copy_streams: int = 1, max_pages_per_block_host: int = 24, max_pages_per_block_device: int = 8, device_cache_percent: float | None = None, host_cache_size: int | None = None) -> None:
        ...

    def __setstate__(self, arg0: tuple) -> None:
        ...

    @property
    def device_cache_percent(self) -> float | None:
        """Device cache size as a percentage, or None if unset."""
        ...

    @property
    def host_cache_size(self) -> int | None:
        """Host cache size, or None if unset."""
        ...

    @property
    def max_adapter_size(self) -> int:
        ...

    @property
    def max_pages_per_block_device(self) -> int:
        ...

    @property
    def max_pages_per_block_host(self) -> int:
        ...

    @property
    def num_copy_streams(self) -> int:
        ...

    @property
    def num_device_module_layer(self) -> int:
        ...

    @property
    def num_ensure_workers(self) -> int:
        ...

    @property
    def num_host_module_layer(self) -> int:
        ...

    @property
    def num_put_workers(self) -> int:
        ...

    @property
    def optimal_adapter_size(self) -> int:
        ...
|
|
class PromptTuningConfig:
    """Prompt-tuning (soft prompt) configuration: a single embedding table.
    Immutable after construction (only a getter is exposed).
    """

    def __init__(self, embedding_table: torch.Tensor) -> None:
        ...

    @property
    def embedding_table(self) -> torch.Tensor:
        """Prompt embedding table tensor."""
        ...
|
|
class Request:
    """A single generation request submitted to :class:`Executor`.

    Required inputs are the prompt token ids and the generation budget
    (max_new_tokens); everything else is optional.
    """

    # Reserved logits_post_processor_name selecting the batched post-processor
    # (see ExecutorConfig.logits_post_processor_batched).
    BATCHED_POST_PROCESSOR_NAME: typing.ClassVar[str] = 'batched'

    output_config: OutputConfig
    return_all_generated_tokens: bool
    sampling_config: SamplingConfig
    # When True, responses are streamed as tokens are generated.
    streaming: bool

    def __init__(self, input_token_ids: list[int], max_new_tokens: int, streaming: bool = False, sampling_config: SamplingConfig = ..., output_config: OutputConfig = ..., end_id: int | None = None, pad_id: int | None = None, bad_words: list[list[int]] | None = None, stop_words: list[list[int]] | None = None, embedding_bias: torch.Tensor | None = None, external_draft_tokens_config: ExternalDraftTokensConfig | None = None, prompt_tuning_config: PromptTuningConfig | None = None, lora_config: LoraConfig | None = None, logits_post_processor_name: str | None = None, encoder_input_token_ids: list[int] | None = None, client_id: int | None = None, return_all_generated_tokens: bool = False) -> None:
        ...

    @property
    def bad_words(self) -> list[list[int]] | None:
        """Banned word sequences as token-id lists, or None."""
        ...

    @bad_words.setter
    def bad_words(self, arg1: list[list[int]]) -> None:
        ...

    @property
    def client_id(self) -> int | None:
        """Caller-supplied correlation id, or None."""
        ...

    @client_id.setter
    def client_id(self, arg1: int) -> None:
        ...

    @property
    def embedding_bias(self) -> torch.Tensor | None:
        ...

    @embedding_bias.setter
    def embedding_bias(self, arg1: torch.Tensor) -> None:
        ...

    @property
    def encoder_input_token_ids(self) -> list[int] | None:
        """Encoder-side input tokens for encoder-decoder models, or None."""
        ...

    @encoder_input_token_ids.setter
    def encoder_input_token_ids(self, arg1: list[int]) -> None:
        ...

    @property
    def end_id(self) -> int | None:
        """End-of-sequence token id, or None."""
        ...

    @end_id.setter
    def end_id(self, arg1: int) -> None:
        ...

    @property
    def external_draft_tokens_config(self) -> ExternalDraftTokensConfig | None:
        ...

    @external_draft_tokens_config.setter
    def external_draft_tokens_config(self, arg1: ExternalDraftTokensConfig) -> None:
        ...

    @property
    def input_token_ids(self) -> list[int]:
        """Prompt token ids (read-only)."""
        ...

    @property
    def logits_post_processor_name(self) -> str | None:
        ...

    @logits_post_processor_name.setter
    def logits_post_processor_name(self, arg1: str) -> None:
        ...

    @property
    def lora_config(self) -> LoraConfig | None:
        ...

    @lora_config.setter
    def lora_config(self, arg1: LoraConfig) -> None:
        ...

    @property
    def max_new_tokens(self) -> int:
        """Maximum number of tokens to generate (read-only)."""
        ...

    @property
    def pad_id(self) -> int | None:
        """Padding token id, or None."""
        ...

    @pad_id.setter
    def pad_id(self, arg1: int) -> None:
        ...

    @property
    def prompt_tuning_config(self) -> PromptTuningConfig | None:
        ...

    @prompt_tuning_config.setter
    def prompt_tuning_config(self, arg1: PromptTuningConfig) -> None:
        ...

    @property
    def stop_words(self) -> list[list[int]] | None:
        """Stop sequences as token-id lists, or None."""
        ...

    @stop_words.setter
    def stop_words(self, arg1: list[list[int]]) -> None:
        ...
|
|
class RequestStage:
    """
    Lifecycle stage of a request (enum-style native binding stub).

    Members:

      QUEUED

      ENCODER_IN_PROGRESS

      CONTEXT_IN_PROGRESS

      GENERATION_IN_PROGRESS

      GENERATION_COMPLETE
    """

    # Enum members are exposed as class-level singleton instances.
    CONTEXT_IN_PROGRESS: typing.ClassVar[RequestStage]
    ENCODER_IN_PROGRESS: typing.ClassVar[RequestStage]
    GENERATION_COMPLETE: typing.ClassVar[RequestStage]
    GENERATION_IN_PROGRESS: typing.ClassVar[RequestStage]
    QUEUED: typing.ClassVar[RequestStage]
    # Mapping of member name -> member instance.
    __members__: typing.ClassVar[dict[str, RequestStage]]

    def __eq__(self, other: typing.Any) -> bool:
        ...

    # Pickle support: state is the underlying integer value.
    def __getstate__(self) -> int:
        ...

    def __hash__(self) -> int:
        ...

    def __index__(self) -> int:
        ...

    def __init__(self, value: int) -> None:
        ...

    def __int__(self) -> int:
        ...

    def __ne__(self, other: typing.Any) -> bool:
        ...

    def __repr__(self) -> str:
        ...

    def __setstate__(self, state: int) -> None:
        ...

    def __str__(self) -> str:
        ...

    @property
    def name(self) -> str:
        """Name of this enum member."""
        ...

    @property
    def value(self) -> int:
        """Underlying integer value of this enum member."""
        ...
|
|
class RequestStats:
    """Statistics for a single request at one iteration. Plain data holder."""

    avg_num_decoded_tokens_per_iter: float
    # Progress of the (possibly chunked) context prefill.
    context_prefill_position: int
    # Request id (matches Executor.enqueue_request's return value).
    id: int
    num_generated_tokens: int
    paused: bool
    scheduled: bool
    # Current lifecycle stage; see RequestStage.
    stage: RequestStage

    def __init__(self) -> None:
        ...

    def to_json_str(self) -> str:
        """Serialize these stats to a JSON string."""
        ...
|
|
class RequestStatsPerIteration:
    """Per-request statistics for one iteration. Plain data holder."""

    # Iteration index.
    iter: int
    # One RequestStats entry per tracked request.
    request_stats: list[RequestStats]

    def __init__(self) -> None:
        ...

    def to_json_str(self) -> str:
        """Serialize these stats to a JSON string."""
        ...
|
|
class Response:
    """A response for one request: either a Result or an error message.

    Check has_error() before reading result/error_msg; accessing the wrong
    one presumably raises in the native binding — confirm before relying on it.
    """

    @typing.overload
    def __init__(self, request_id: int, error_msg: str) -> None:
        """Construct an error response."""
        ...

    @typing.overload
    def __init__(self, request_id: int, result: Result) -> None:
        """Construct a successful response carrying a Result."""
        ...

    def has_error(self) -> bool:
        """True when this response carries an error message, not a result."""
        ...

    @property
    def error_msg(self) -> str:
        ...

    @property
    def request_id(self) -> int:
        """Id of the request this response belongs to."""
        ...

    @property
    def result(self) -> Result:
        ...
|
|
class Result:
    """Generation output of a request. Plain data holder; optional fields
    are populated according to the request's OutputConfig flags.
    """

    context_logits: torch.Tensor | None
    cum_log_probs: list[float] | None
    encoder_output: torch.Tensor | None
    generation_logits: torch.Tensor | None
    # True when this is the final result for the request (e.g. in streaming).
    is_final: bool
    log_probs: list[list[float]] | None
    # One token-id list per beam/sequence.
    output_token_ids: list[list[int]]

    def __init__(self) -> None:
        ...
|
|
class SamplingConfig:
    """Sampling / beam-search parameters for a request.

    Every field except beam_width is optional; None leaves the runtime
    default in effect.
    """

    beam_search_diversity_rate: float | None
    # Number of beams; 1 (the default) means no beam search.
    beam_width: int
    early_stopping: int | None
    frequency_penalty: float | None
    length_penalty: float | None
    min_length: int | None
    no_repeat_ngram_size: int | None
    presence_penalty: float | None
    random_seed: int | None
    repetition_penalty: float | None
    temperature: float | None
    top_k: int | None
    top_p: float | None
    top_p_decay: float | None
    top_p_min: float | None
    top_p_reset_ids: int | None

    def __init__(self, beam_width: int = 1, top_k: int | None = None, top_p: float | None = None, top_p_min: float | None = None, top_p_reset_ids: int | None = None, top_p_decay: float | None = None, random_seed: int | None = None, temperature: float | None = None, min_length: int | None = None, beam_search_diversity_rate: float | None = None, repetition_penalty: float | None = None, presence_penalty: float | None = None, frequency_penalty: float | None = None, length_penalty: float | None = None, early_stopping: int | None = None, no_repeat_ngram_size: int | None = None) -> None:
        ...
|
|
class SchedulerConfig:
    """Scheduler policies: capacity scheduling plus optional context chunking.
    Immutable after construction (only getters are exposed).
    """

    # Pickle support.
    def __getstate__(self) -> tuple:
        ...

    @typing.overload
    def __init__(self, capacity_scheduler_policy: CapacitySchedulerPolicy = ...) -> None:
        ...

    @typing.overload
    def __init__(self, capacity_scheduler_policy: CapacitySchedulerPolicy, context_chunking_policy: ContextChunkingPolicy | None) -> None:
        ...

    def __setstate__(self, arg0: tuple) -> None:
        ...

    @property
    def capacity_scheduler_policy(self) -> CapacitySchedulerPolicy:
        """Active capacity-scheduling policy."""
        ...

    @property
    def context_chunking_policy(self) -> ContextChunkingPolicy | None:
        """Context-chunking policy, or None when not configured."""
        ...
|
|
class StaticBatchingStats:
    """Per-iteration statistics when static batching is active.
    Plain data holder.
    """

    # Generation slots left unused this iteration.
    empty_gen_slots: int
    num_context_requests: int
    num_ctx_tokens: int
    num_gen_tokens: int
    num_scheduled_requests: int

    def __init__(self) -> None:
        ...
|
|
# Version string of the native extension these stubs describe.
__version__: str = '0.12.0.dev2024072300'
|
|
|