# t12
# File size: 28,094 Bytes
"""
Executor bindings
"""
from __future__ import annotations
import datetime
import os
import torch
import typing
# Public names exported by this stub module, one per line.
__all__ = [
    'BatchingType',
    'CapacitySchedulerPolicy',
    'CommunicationMode',
    'CommunicationType',
    'ContextChunkingPolicy',
    'DecodingConfig',
    'DecodingMode',
    'Executor',
    'ExecutorConfig',
    'ExternalDraftTokensConfig',
    'InflightBatchingStats',
    'IterationStats',
    'KvCacheConfig',
    'KvCacheStats',
    'LookaheadDecodingConfig',
    'LoraConfig',
    'ModelType',
    'OrchestratorConfig',
    'OutputConfig',
    'ParallelConfig',
    'PeftCacheConfig',
    'PromptTuningConfig',
    'Request',
    'RequestStage',
    'RequestStats',
    'RequestStatsPerIteration',
    'Response',
    'Result',
    'SamplingConfig',
    'SchedulerConfig',
    'StaticBatchingStats',
]
class BatchingType:
    """
    Type stub for the ``BatchingType`` enumeration binding.

    Members:

      STATIC (0)

      INFLIGHT (1)
    """
    # Enumerators, plus the name -> member lookup table.
    INFLIGHT: typing.ClassVar[BatchingType]  # value = <BatchingType.INFLIGHT: 1>
    STATIC: typing.ClassVar[BatchingType]  # value = <BatchingType.STATIC: 0>
    __members__: typing.ClassVar[dict[str, BatchingType]]  # value = {'STATIC': <BatchingType.STATIC: 0>, 'INFLIGHT': <BatchingType.INFLIGHT: 1>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class CapacitySchedulerPolicy:
    """
    Type stub for the ``CapacitySchedulerPolicy`` enumeration binding.

    Members:

      MAX_UTILIZATION (0)

      GUARANTEED_NO_EVICT (1)
    """
    # Enumerators, plus the name -> member lookup table.
    GUARANTEED_NO_EVICT: typing.ClassVar[CapacitySchedulerPolicy]  # value = <CapacitySchedulerPolicy.GUARANTEED_NO_EVICT: 1>
    MAX_UTILIZATION: typing.ClassVar[CapacitySchedulerPolicy]  # value = <CapacitySchedulerPolicy.MAX_UTILIZATION: 0>
    __members__: typing.ClassVar[dict[str, CapacitySchedulerPolicy]]  # value = {'MAX_UTILIZATION': <CapacitySchedulerPolicy.MAX_UTILIZATION: 0>, 'GUARANTEED_NO_EVICT': <CapacitySchedulerPolicy.GUARANTEED_NO_EVICT: 1>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class CommunicationMode:
    """
    Type stub for the ``CommunicationMode`` enumeration binding.

    Members:

      LEADER (0)

      ORCHESTRATOR (1)
    """
    # Enumerators, plus the name -> member lookup table.
    LEADER: typing.ClassVar[CommunicationMode]  # value = <CommunicationMode.LEADER: 0>
    ORCHESTRATOR: typing.ClassVar[CommunicationMode]  # value = <CommunicationMode.ORCHESTRATOR: 1>
    __members__: typing.ClassVar[dict[str, CommunicationMode]]  # value = {'LEADER': <CommunicationMode.LEADER: 0>, 'ORCHESTRATOR': <CommunicationMode.ORCHESTRATOR: 1>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class CommunicationType:
    """
    Type stub for the ``CommunicationType`` enumeration binding.

    Members:

      MPI (0)
    """
    # The single enumerator, plus the name -> member lookup table.
    MPI: typing.ClassVar[CommunicationType]  # value = <CommunicationType.MPI: 0>
    __members__: typing.ClassVar[dict[str, CommunicationType]]  # value = {'MPI': <CommunicationType.MPI: 0>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class ContextChunkingPolicy:
    """
    Type stub for the ``ContextChunkingPolicy`` enumeration binding.

    Members:

      FIRST_COME_FIRST_SERVED (0)

      EQUAL_PROGRESS (1)
    """
    # Enumerators, plus the name -> member lookup table.
    EQUAL_PROGRESS: typing.ClassVar[ContextChunkingPolicy]  # value = <ContextChunkingPolicy.EQUAL_PROGRESS: 1>
    FIRST_COME_FIRST_SERVED: typing.ClassVar[ContextChunkingPolicy]  # value = <ContextChunkingPolicy.FIRST_COME_FIRST_SERVED: 0>
    __members__: typing.ClassVar[dict[str, ContextChunkingPolicy]]  # value = {'EQUAL_PROGRESS': <ContextChunkingPolicy.EQUAL_PROGRESS: 1>, 'FIRST_COME_FIRST_SERVED': <ContextChunkingPolicy.FIRST_COME_FIRST_SERVED: 0>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class DecodingConfig:
    """
    Type stub for the ``DecodingConfig`` binding class.

    All three constructor arguments are optional and default to ``None``.
    Each one is also exposed as a property whose getter may return ``None``
    and whose setter is declared with the non-optional type.
    """

    def __init__(
        self,
        decoding_mode: DecodingMode | None = None,
        lookahead_decoding_config: LookaheadDecodingConfig | None = None,
        medusa_choices: list[list[int]] | None = None,
    ) -> None: ...

    @property
    def decoding_mode(self) -> DecodingMode | None: ...
    @decoding_mode.setter
    def decoding_mode(self, arg1: DecodingMode) -> None: ...

    @property
    def lookahead_decoding_config(self) -> LookaheadDecodingConfig | None: ...
    @lookahead_decoding_config.setter
    def lookahead_decoding_config(self, arg1: LookaheadDecodingConfig) -> None: ...

    @property
    def medusa_choices(self) -> list[list[int]] | None: ...
    @medusa_choices.setter
    def medusa_choices(self, arg1: list[list[int]]) -> None: ...
class DecodingMode:
    """
    Type stub for the ``DecodingMode`` binding class.

    Instances are obtained through the static factory methods; the ``is*``
    methods are boolean predicates on an instance.
    """

    # Static factories, each declared to return a DecodingMode.
    @staticmethod
    def Auto() -> DecodingMode: ...
    @staticmethod
    def BeamSearch() -> DecodingMode: ...
    @staticmethod
    def Lookahead() -> DecodingMode: ...
    @staticmethod
    def Medusa() -> DecodingMode: ...
    @staticmethod
    def TopK() -> DecodingMode: ...
    @staticmethod
    def TopKTopP() -> DecodingMode: ...
    @staticmethod
    def TopP() -> DecodingMode: ...

    # Boolean predicates.
    def isAuto(self) -> bool: ...
    def isBeamSearch(self) -> bool: ...
    def isLookahead(self) -> bool: ...
    def isMedusa(self) -> bool: ...
    def isTopK(self) -> bool: ...
    def isTopKandTopP(self) -> bool: ...
    def isTopKorTopP(self) -> bool: ...
    def isTopP(self) -> bool: ...
class Executor:
    """
    Type stub for the ``Executor`` binding class.

    The class declares the context-manager protocol (``__enter__`` /
    ``__exit__``), four constructor overloads, and three ``await_responses``
    overloads.

    Constructor overloads:
      * ``model_path``, ``model_type``, ``executor_config``
      * ``encoder_model_path``, ``decoder_model_path``, ``model_type``,
        ``executor_config``
      * ``engine_buffer``, ``json_config_str``, ``model_type``,
        ``executor_config``
      * ``encoder_engine_buffer``, ``encoder_json_config_str``,
        ``decoder_engine_buffer``, ``decoder_json_config_str``,
        ``model_type``, ``executor_config``

    ``await_responses`` overloads:
      * no id, or a single ``id``: returns ``list[Response]``
      * a list of ``ids``: returns ``list[list[Response]]``
    Each accepts an optional ``timeout`` (``datetime.timedelta``).
    """

    # Context-manager protocol.
    def __enter__(self) -> typing.Any: ...
    def __exit__(self, arg0: typing.Any, arg1: typing.Any, arg2: typing.Any) -> None: ...

    # Constructor overloads.
    @typing.overload
    def __init__(self, model_path: os.PathLike, model_type: ModelType, executor_config: ExecutorConfig) -> None: ...
    @typing.overload
    def __init__(self, encoder_model_path: os.PathLike, decoder_model_path: os.PathLike, model_type: ModelType, executor_config: ExecutorConfig) -> None: ...
    @typing.overload
    def __init__(self, engine_buffer: str, json_config_str: str, model_type: ModelType, executor_config: ExecutorConfig) -> None: ...
    @typing.overload
    def __init__(self, encoder_engine_buffer: str, encoder_json_config_str: str, decoder_engine_buffer: str, decoder_json_config_str: str, model_type: ModelType, executor_config: ExecutorConfig) -> None: ...

    # Response retrieval overloads.
    @typing.overload
    def await_responses(self, timeout: datetime.timedelta | None = None) -> list[Response]: ...
    @typing.overload
    def await_responses(self, id: int, timeout: datetime.timedelta | None = None) -> list[Response]: ...
    @typing.overload
    def await_responses(self, ids: list[int], timeout: datetime.timedelta | None = None) -> list[list[Response]]: ...

    def can_enqueue_requests(self) -> bool: ...

    # The default is None, so the annotation spells out the optional type
    # (PEP 484 disallows implicit Optional) and matches the signature of
    # get_num_responses_ready below.
    # NOTE(review): whether the underlying binding accepts an omitted or None
    # id at call time is not visible in this stub; confirm against the
    # binding.
    def cancel_request(self, id: int | None = None) -> None: ...

    def enqueue_request(self, request: Request) -> int: ...
    def enqueue_requests(self, requests: list[Request]) -> list[int]: ...
    def get_latest_iteration_stats(self) -> list[IterationStats]: ...
    def get_latest_request_stats(self) -> list[RequestStatsPerIteration]: ...
    def get_num_responses_ready(self, id: int | None = None) -> int: ...
    def shutdown(self) -> None: ...
class ExecutorConfig:
    """
    Type stub for the ``ExecutorConfig`` binding class.

    Every constructor argument has a default. Some settings are declared as
    plain attributes, others as properties whose getter may return ``None``
    while the setter is declared with the non-optional type. The class also
    declares the pickle hooks ``__getstate__`` / ``__setstate__`` with tuple
    state.
    """
    # Settings declared as plain attributes.
    batching_type: BatchingType
    enable_chunked_context: bool
    gpu_weights_percent: float
    iter_stats_max_iterations: int
    kv_cache_config: KvCacheConfig
    max_beam_width: int
    max_queue_size: int | None
    multi_block_mode: bool
    normalize_log_probs: bool
    request_stats_max_iterations: int
    scheduler_config: SchedulerConfig

    def __init__(
        self,
        max_beam_width: int = 1,
        scheduler_config: SchedulerConfig = ...,
        kv_cache_config: KvCacheConfig = ...,
        enable_chunked_context: bool = False,
        normalize_log_probs: bool = True,
        iter_stats_max_iterations: int = 1000,
        request_stats_max_iterations: int = 0,
        batching_type: BatchingType = ...,
        max_batch_size: int | None = None,
        max_num_tokens: int | None = None,
        parallel_config: ParallelConfig | None = None,
        peft_cache_config: PeftCacheConfig = ...,
        logits_post_processor_map: dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int, int | None], None]] | None = None,
        logits_post_processor_batched: typing.Callable[[list[int], list[torch.Tensor], list[list[list[int]]], int, list[int | None]], None] | None = None,
        decoding_config: DecodingConfig | None = None,
        gpu_weights_percent: float = 1.0,
        max_queue_size: int | None = None,
        multi_block_mode: bool = False,
    ) -> None: ...

    # Pickle protocol hooks.
    def __getstate__(self) -> tuple: ...
    def __setstate__(self, arg0: tuple) -> None: ...

    # Settings declared as getter/setter properties.
    @property
    def decoding_config(self) -> DecodingConfig | None: ...
    @decoding_config.setter
    def decoding_config(self, arg1: DecodingConfig) -> None: ...

    @property
    def logits_post_processor_batched(self) -> typing.Callable[[list[int], list[torch.Tensor], list[list[list[int]]], int, list[int | None]], None] | None: ...
    @logits_post_processor_batched.setter
    def logits_post_processor_batched(self, arg1: typing.Callable[[list[int], list[torch.Tensor], list[list[list[int]]], int, list[int | None]], None]) -> None: ...

    @property
    def logits_post_processor_map(self) -> dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int, int | None], None]] | None: ...
    @logits_post_processor_map.setter
    def logits_post_processor_map(self, arg1: dict[str, typing.Callable[[int, torch.Tensor, list[list[int]], int, int | None], None]]) -> None: ...

    @property
    def max_batch_size(self) -> int | None: ...
    @max_batch_size.setter
    def max_batch_size(self, arg1: int) -> None: ...

    @property
    def max_num_tokens(self) -> int | None: ...
    @max_num_tokens.setter
    def max_num_tokens(self, arg1: int) -> None: ...

    @property
    def parallel_config(self) -> ParallelConfig | None: ...
    @parallel_config.setter
    def parallel_config(self, arg1: ParallelConfig) -> None: ...

    @property
    def peft_cache_config(self) -> PeftCacheConfig | None: ...
    @peft_cache_config.setter
    def peft_cache_config(self, arg1: PeftCacheConfig) -> None: ...
class ExternalDraftTokensConfig:
    """
    Type stub for the ``ExternalDraftTokensConfig`` binding class.

    ``tokens`` is required; ``logits`` and ``acceptance_threshold`` default
    to ``None``. All three are exposed as read-only properties.
    """

    def __init__(
        self,
        tokens: list[int],
        logits: torch.Tensor | None = None,
        acceptance_threshold: float | None = None,
    ) -> None: ...

    @property
    def acceptance_threshold(self) -> float | None: ...
    @property
    def logits(self) -> torch.Tensor | None: ...
    @property
    def tokens(self) -> list[int]: ...
class InflightBatchingStats:
    """
    Type stub for the ``InflightBatchingStats`` binding class.

    A record of numeric fields with a no-argument constructor.
    """
    avg_num_decoded_tokens_per_iter: float
    micro_batch_id: int
    num_context_requests: int
    num_ctx_tokens: int
    num_gen_requests: int
    num_paused_requests: int
    num_scheduled_requests: int

    def __init__(self) -> None: ...
class IterationStats:
    """
    Type stub for the ``IterationStats`` binding class.

    A record with a no-argument constructor. The three nested stats fields
    (``inflight_batching_stats``, ``kv_cache_stats``,
    ``static_batching_stats``) may be ``None``. ``to_json_str`` is declared
    to return a string.
    """
    cpu_mem_usage: int
    gpu_mem_usage: int
    inflight_batching_stats: InflightBatchingStats | None
    iter: int
    iter_latency_ms: float
    kv_cache_stats: KvCacheStats | None
    max_num_active_requests: int
    num_active_requests: int
    num_queued_requests: int
    pinned_mem_usage: int
    static_batching_stats: StaticBatchingStats | None
    timestamp: str

    def __init__(self) -> None: ...
    def to_json_str(self) -> str: ...
class KvCacheConfig:
    """
    Type stub for the ``KvCacheConfig`` binding class.

    Every constructor argument has a default. ``enable_block_reuse`` and
    ``onboard_blocks`` are declared as plain attributes; the remaining
    settings are properties whose getter may return ``None`` while the
    setter is declared with the non-optional type. The class also declares
    the pickle hooks ``__getstate__`` / ``__setstate__`` with tuple state.
    """
    enable_block_reuse: bool
    onboard_blocks: bool

    def __init__(
        self,
        enable_block_reuse: bool = False,
        max_tokens: int | None = None,
        max_attention_window: int | None = None,
        sink_token_length: int | None = None,
        free_gpu_memory_fraction: float | None = None,
        host_cache_size: int | None = None,
        onboard_blocks: bool = True,
    ) -> None: ...

    # Pickle protocol hooks.
    def __getstate__(self) -> tuple: ...
    def __setstate__(self, arg0: tuple) -> None: ...

    @property
    def free_gpu_memory_fraction(self) -> float | None: ...
    @free_gpu_memory_fraction.setter
    def free_gpu_memory_fraction(self, arg1: float) -> None: ...

    @property
    def host_cache_size(self) -> int | None: ...
    @host_cache_size.setter
    def host_cache_size(self, arg1: int) -> None: ...

    @property
    def max_attention_window(self) -> int | None: ...
    @max_attention_window.setter
    def max_attention_window(self, arg1: int) -> None: ...

    @property
    def max_tokens(self) -> int | None: ...
    @max_tokens.setter
    def max_tokens(self, arg1: int) -> None: ...

    @property
    def sink_token_length(self) -> int | None: ...
    @sink_token_length.setter
    def sink_token_length(self, arg1: int) -> None: ...
class KvCacheStats:
    """
    Type stub for the ``KvCacheStats`` binding class.

    A record of integer fields with a no-argument constructor.
    """
    alloc_new_blocks: int
    alloc_total_blocks: int
    free_num_blocks: int
    max_num_blocks: int
    reused_blocks: int
    tokens_per_block: int
    used_num_blocks: int

    def __init__(self) -> None: ...
class LookaheadDecodingConfig:
    """
    Type stub for the ``LookaheadDecodingConfig`` binding class.

    The constructor takes three required integers, each exposed afterwards
    as a read-only property of the same name.
    """

    def __init__(
        self,
        max_window_size: int,
        max_ngram_size: int,
        max_verification_set_size: int,
    ) -> None: ...

    @property
    def max_ngram_size(self) -> int: ...
    @property
    def max_verification_set_size(self) -> int: ...
    @property
    def max_window_size(self) -> int: ...
class LoraConfig:
    """
    Type stub for the ``LoraConfig`` binding class.

    ``task_id`` is required; ``weights`` and ``config`` default to ``None``.
    All three are exposed as read-only properties.
    """

    def __init__(
        self,
        task_id: int,
        weights: torch.Tensor | None = None,
        config: torch.Tensor | None = None,
    ) -> None: ...

    @property
    def config(self) -> torch.Tensor | None: ...
    @property
    def task_id(self) -> int: ...
    @property
    def weights(self) -> torch.Tensor | None: ...
class ModelType:
    """
    Type stub for the ``ModelType`` enumeration binding.

    Members:

      DECODER_ONLY (0)

      ENCODER_ONLY (1)

      ENCODER_DECODER (2)
    """
    # Enumerators, plus the name -> member lookup table.
    DECODER_ONLY: typing.ClassVar[ModelType]  # value = <ModelType.DECODER_ONLY: 0>
    ENCODER_DECODER: typing.ClassVar[ModelType]  # value = <ModelType.ENCODER_DECODER: 2>
    ENCODER_ONLY: typing.ClassVar[ModelType]  # value = <ModelType.ENCODER_ONLY: 1>
    __members__: typing.ClassVar[dict[str, ModelType]]  # value = {'DECODER_ONLY': <ModelType.DECODER_ONLY: 0>, 'ENCODER_ONLY': <ModelType.ENCODER_ONLY: 1>, 'ENCODER_DECODER': <ModelType.ENCODER_DECODER: 2>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class OrchestratorConfig:
    """
    Type stub for the ``OrchestratorConfig`` binding class.

    Both fields are plain attributes; the constructor defaults are
    ``is_orchestrator=True`` and ``worker_executable_path=''``.
    """
    is_orchestrator: bool
    worker_executable_path: str

    def __init__(
        self,
        is_orchestrator: bool = True,
        worker_executable_path: str = '',
    ) -> None: ...
class OutputConfig:
    """
    Type stub for the ``OutputConfig`` binding class.

    Five boolean flags, each a plain attribute and each defaulting to
    ``False`` in the constructor.
    """
    exclude_input_from_output: bool
    return_context_logits: bool
    return_encoder_output: bool
    return_generation_logits: bool
    return_log_probs: bool

    def __init__(
        self,
        return_log_probs: bool = False,
        return_context_logits: bool = False,
        return_generation_logits: bool = False,
        exclude_input_from_output: bool = False,
        return_encoder_output: bool = False,
    ) -> None: ...
class ParallelConfig:
    """
    Type stub for the ``ParallelConfig`` binding class.

    ``communication_mode`` and ``communication_type`` are plain attributes.
    ``device_ids``, ``participant_ids`` and ``orchestrator_config`` are
    properties whose getter may return ``None`` while the setter is declared
    with the non-optional type. The class also declares the pickle hooks
    ``__getstate__`` / ``__setstate__`` with tuple state.
    """
    communication_mode: CommunicationMode
    communication_type: CommunicationType

    def __init__(
        self,
        communication_type: CommunicationType = ...,
        communication_mode: CommunicationMode = ...,
        device_ids: list[int] | None = None,
        participant_ids: list[int] | None = None,
        orchestrator_config: OrchestratorConfig | None = None,
    ) -> None: ...

    # Pickle protocol hooks.
    def __getstate__(self) -> tuple: ...
    def __setstate__(self, arg0: tuple) -> None: ...

    @property
    def device_ids(self) -> list[int] | None: ...
    @device_ids.setter
    def device_ids(self, arg1: list[int]) -> None: ...

    @property
    def orchestrator_config(self) -> OrchestratorConfig | None: ...
    @orchestrator_config.setter
    def orchestrator_config(self, arg1: OrchestratorConfig) -> None: ...

    @property
    def participant_ids(self) -> list[int] | None: ...
    @participant_ids.setter
    def participant_ids(self, arg1: list[int]) -> None: ...
class PeftCacheConfig:
    """
    Type stub for the ``PeftCacheConfig`` binding class.

    Every constructor argument has a default, and each is exposed as a
    read-only property of the same name. The class also declares the pickle
    hooks ``__getstate__`` / ``__setstate__`` with tuple state.
    """

    def __init__(
        self,
        num_host_module_layer: int = 0,
        num_device_module_layer: int = 0,
        optimal_adapter_size: int = 8,
        max_adapter_size: int = 64,
        num_put_workers: int = 1,
        num_ensure_workers: int = 1,
        num_copy_streams: int = 1,
        max_pages_per_block_host: int = 24,
        max_pages_per_block_device: int = 8,
        device_cache_percent: float | None = None,
        host_cache_size: int | None = None,
    ) -> None: ...

    # Pickle protocol hooks.
    def __getstate__(self) -> tuple: ...
    def __setstate__(self, arg0: tuple) -> None: ...

    # Read-only views of the constructor arguments.
    @property
    def device_cache_percent(self) -> float | None: ...
    @property
    def host_cache_size(self) -> int | None: ...
    @property
    def max_adapter_size(self) -> int: ...
    @property
    def max_pages_per_block_device(self) -> int: ...
    @property
    def max_pages_per_block_host(self) -> int: ...
    @property
    def num_copy_streams(self) -> int: ...
    @property
    def num_device_module_layer(self) -> int: ...
    @property
    def num_ensure_workers(self) -> int: ...
    @property
    def num_host_module_layer(self) -> int: ...
    @property
    def num_put_workers(self) -> int: ...
    @property
    def optimal_adapter_size(self) -> int: ...
class PromptTuningConfig:
    """
    Type stub for the ``PromptTuningConfig`` binding class.

    Built from a single required tensor, exposed afterwards as the
    read-only ``embedding_table`` property.
    """

    def __init__(self, embedding_table: torch.Tensor) -> None: ...

    @property
    def embedding_table(self) -> torch.Tensor: ...
class Request:
    """
    Type stub for the ``Request`` binding class.

    ``input_token_ids`` and ``max_new_tokens`` are required and read-only
    after construction. ``output_config``, ``return_all_generated_tokens``,
    ``sampling_config`` and ``streaming`` are plain attributes. The other
    constructor arguments are properties whose getter may return ``None``
    while the setter is declared with the non-optional type.
    """
    BATCHED_POST_PROCESSOR_NAME: typing.ClassVar[str] = 'batched'

    # Settings declared as plain attributes.
    output_config: OutputConfig
    return_all_generated_tokens: bool
    sampling_config: SamplingConfig
    streaming: bool

    def __init__(
        self,
        input_token_ids: list[int],
        max_new_tokens: int,
        streaming: bool = False,
        sampling_config: SamplingConfig = ...,
        output_config: OutputConfig = ...,
        end_id: int | None = None,
        pad_id: int | None = None,
        bad_words: list[list[int]] | None = None,
        stop_words: list[list[int]] | None = None,
        embedding_bias: torch.Tensor | None = None,
        external_draft_tokens_config: ExternalDraftTokensConfig | None = None,
        prompt_tuning_config: PromptTuningConfig | None = None,
        lora_config: LoraConfig | None = None,
        logits_post_processor_name: str | None = None,
        encoder_input_token_ids: list[int] | None = None,
        client_id: int | None = None,
        return_all_generated_tokens: bool = False,
    ) -> None: ...

    @property
    def bad_words(self) -> list[list[int]] | None: ...
    @bad_words.setter
    def bad_words(self, arg1: list[list[int]]) -> None: ...

    @property
    def client_id(self) -> int | None: ...
    @client_id.setter
    def client_id(self, arg1: int) -> None: ...

    @property
    def embedding_bias(self) -> torch.Tensor | None: ...
    @embedding_bias.setter
    def embedding_bias(self, arg1: torch.Tensor) -> None: ...

    @property
    def encoder_input_token_ids(self) -> list[int] | None: ...
    @encoder_input_token_ids.setter
    def encoder_input_token_ids(self, arg1: list[int]) -> None: ...

    @property
    def end_id(self) -> int | None: ...
    @end_id.setter
    def end_id(self, arg1: int) -> None: ...

    @property
    def external_draft_tokens_config(self) -> ExternalDraftTokensConfig | None: ...
    @external_draft_tokens_config.setter
    def external_draft_tokens_config(self, arg1: ExternalDraftTokensConfig) -> None: ...

    # Read-only: no setter is declared.
    @property
    def input_token_ids(self) -> list[int]: ...

    @property
    def logits_post_processor_name(self) -> str | None: ...
    @logits_post_processor_name.setter
    def logits_post_processor_name(self, arg1: str) -> None: ...

    @property
    def lora_config(self) -> LoraConfig | None: ...
    @lora_config.setter
    def lora_config(self, arg1: LoraConfig) -> None: ...

    # Read-only: no setter is declared.
    @property
    def max_new_tokens(self) -> int: ...

    @property
    def pad_id(self) -> int | None: ...
    @pad_id.setter
    def pad_id(self, arg1: int) -> None: ...

    @property
    def prompt_tuning_config(self) -> PromptTuningConfig | None: ...
    @prompt_tuning_config.setter
    def prompt_tuning_config(self, arg1: PromptTuningConfig) -> None: ...

    @property
    def stop_words(self) -> list[list[int]] | None: ...
    @stop_words.setter
    def stop_words(self, arg1: list[list[int]]) -> None: ...
class RequestStage:
    """
    Type stub for the ``RequestStage`` enumeration binding.

    Members:

      QUEUED (0)

      ENCODER_IN_PROGRESS (1)

      CONTEXT_IN_PROGRESS (2)

      GENERATION_IN_PROGRESS (3)

      GENERATION_COMPLETE (4)
    """
    # Enumerators, plus the name -> member lookup table.
    CONTEXT_IN_PROGRESS: typing.ClassVar[RequestStage]  # value = <RequestStage.CONTEXT_IN_PROGRESS: 2>
    ENCODER_IN_PROGRESS: typing.ClassVar[RequestStage]  # value = <RequestStage.ENCODER_IN_PROGRESS: 1>
    GENERATION_COMPLETE: typing.ClassVar[RequestStage]  # value = <RequestStage.GENERATION_COMPLETE: 4>
    GENERATION_IN_PROGRESS: typing.ClassVar[RequestStage]  # value = <RequestStage.GENERATION_IN_PROGRESS: 3>
    QUEUED: typing.ClassVar[RequestStage]  # value = <RequestStage.QUEUED: 0>
    __members__: typing.ClassVar[dict[str, RequestStage]]  # value = {'QUEUED': <RequestStage.QUEUED: 0>, 'ENCODER_IN_PROGRESS': <RequestStage.ENCODER_IN_PROGRESS: 1>, 'CONTEXT_IN_PROGRESS': <RequestStage.CONTEXT_IN_PROGRESS: 2>, 'GENERATION_IN_PROGRESS': <RequestStage.GENERATION_IN_PROGRESS: 3>, 'GENERATION_COMPLETE': <RequestStage.GENERATION_COMPLETE: 4>}

    # Construction from, and conversion to, the underlying integer.
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __index__(self) -> int: ...

    # Comparison and hashing.
    def __eq__(self, other: typing.Any) -> bool: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __hash__(self) -> int: ...

    # Pickle protocol hooks; the state is declared as an int.
    def __getstate__(self) -> int: ...
    def __setstate__(self, state: int) -> None: ...

    # Text representations.
    def __repr__(self) -> str: ...
    def __str__(self) -> str: ...

    # Read-only accessors (no setters are declared).
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class RequestStats:
    """
    Type stub for the ``RequestStats`` binding class.

    A record with a no-argument constructor; ``to_json_str`` is declared to
    return a string.
    """
    avg_num_decoded_tokens_per_iter: float
    context_prefill_position: int
    id: int
    num_generated_tokens: int
    paused: bool
    scheduled: bool
    stage: RequestStage

    def __init__(self) -> None: ...
    def to_json_str(self) -> str: ...
class RequestStatsPerIteration:
    """
    Type stub for the ``RequestStatsPerIteration`` binding class.

    Pairs an integer ``iter`` with a list of ``RequestStats``; the
    constructor takes no arguments and ``to_json_str`` is declared to
    return a string.
    """
    iter: int
    request_stats: list[RequestStats]

    def __init__(self) -> None: ...
    def to_json_str(self) -> str: ...
class Response:
    """
    Type stub for the ``Response`` binding class.

    Two constructor overloads are declared: a request id with an error
    message string, or a request id with a ``Result``. ``has_error`` is a
    boolean predicate; ``error_msg``, ``request_id`` and ``result`` are
    read-only properties.
    """

    @typing.overload
    def __init__(self, request_id: int, error_msg: str) -> None: ...
    @typing.overload
    def __init__(self, request_id: int, result: Result) -> None: ...

    def has_error(self) -> bool: ...

    @property
    def error_msg(self) -> str: ...
    @property
    def request_id(self) -> int: ...
    @property
    def result(self) -> Result: ...
class Result:
    """
    Type stub for the ``Result`` binding class.

    A record with a no-argument constructor. ``is_final`` and
    ``output_token_ids`` are always typed as present; the logits, log-prob
    and encoder-output fields may be ``None``.
    """
    context_logits: torch.Tensor | None
    cum_log_probs: list[float] | None
    encoder_output: torch.Tensor | None
    generation_logits: torch.Tensor | None
    is_final: bool
    log_probs: list[list[float]] | None
    output_token_ids: list[list[int]]

    def __init__(self) -> None: ...
class SamplingConfig:
    """
    Type stub for the ``SamplingConfig`` binding class.

    All fields are plain attributes. ``beam_width`` is an int defaulting to
    1 in the constructor; every other field is optional and defaults to
    ``None``.
    """
    beam_search_diversity_rate: float | None
    beam_width: int
    early_stopping: int | None
    frequency_penalty: float | None
    length_penalty: float | None
    min_length: int | None
    no_repeat_ngram_size: int | None
    presence_penalty: float | None
    random_seed: int | None
    repetition_penalty: float | None
    temperature: float | None
    top_k: int | None
    top_p: float | None
    top_p_decay: float | None
    top_p_min: float | None
    top_p_reset_ids: int | None

    def __init__(
        self,
        beam_width: int = 1,
        top_k: int | None = None,
        top_p: float | None = None,
        top_p_min: float | None = None,
        top_p_reset_ids: int | None = None,
        top_p_decay: float | None = None,
        random_seed: int | None = None,
        temperature: float | None = None,
        min_length: int | None = None,
        beam_search_diversity_rate: float | None = None,
        repetition_penalty: float | None = None,
        presence_penalty: float | None = None,
        frequency_penalty: float | None = None,
        length_penalty: float | None = None,
        early_stopping: int | None = None,
        no_repeat_ngram_size: int | None = None,
    ) -> None: ...
class SchedulerConfig:
    """
    Type stub for the ``SchedulerConfig`` binding class.

    Two constructor overloads are declared: one taking only
    ``capacity_scheduler_policy`` (which has a default), and one taking both
    ``capacity_scheduler_policy`` and an optional ``context_chunking_policy``.
    Both values are read-only properties. The class also declares the pickle
    hooks ``__getstate__`` / ``__setstate__`` with tuple state.
    """

    @typing.overload
    def __init__(self, capacity_scheduler_policy: CapacitySchedulerPolicy = ...) -> None: ...
    @typing.overload
    def __init__(self, capacity_scheduler_policy: CapacitySchedulerPolicy, context_chunking_policy: ContextChunkingPolicy | None) -> None: ...

    # Pickle protocol hooks.
    def __getstate__(self) -> tuple: ...
    def __setstate__(self, arg0: tuple) -> None: ...

    @property
    def capacity_scheduler_policy(self) -> CapacitySchedulerPolicy: ...
    @property
    def context_chunking_policy(self) -> ContextChunkingPolicy | None: ...
class StaticBatchingStats:
    """
    Type stub for the ``StaticBatchingStats`` binding class.

    A record of integer fields with a no-argument constructor.
    """
    empty_gen_slots: int
    num_context_requests: int
    num_ctx_tokens: int
    num_gen_tokens: int
    num_scheduled_requests: int

    def __init__(self) -> None: ...
# Version string recorded for this bindings module.
__version__: str = '0.12.0.dev2024072300'