"""
TensorRT-LLM Python bindings for the C++ runtime
"""
from __future__ import annotations
import os
import torch
import typing
from . import BuildInfo
from . import executor
from . import tensor_names
__all__ = ['BF16', 'BOOL', 'BuildInfo', 'DataType', 'FLOAT', 'FP8', 'GptJsonConfig', 'GptManager', 'GptModelVariant', 'HALF', 'INT32', 'INT64', 'INT8', 'InferenceRequest', 'KvCacheConfig', 'LlmRequest', 'LlmRequestState', 'MemoryCounters', 'ModelConfig', 'MpiComm', 'NamedTensor', 'PeftCacheManagerConfig', 'QuantMode', 'SamplingConfig', 'TrtGptModelOptionalParams', 'TrtGptModelType', 'UINT8', 'WorldConfig', 'executor', 'tensor_names']
class DataType:
    """
    Members:
    
      FLOAT
    
      HALF
    
      INT8
    
      INT32
    
      BOOL
    
      UINT8
    
      FP8
    
      BF16
    
      INT64
    """
    BF16: typing.ClassVar[DataType]  # value = <DataType.BF16: 7>
    BOOL: typing.ClassVar[DataType]  # value = <DataType.BOOL: 4>
    FLOAT: typing.ClassVar[DataType]  # value = <DataType.FLOAT: 0>
    FP8: typing.ClassVar[DataType]  # value = <DataType.FP8: 6>
    HALF: typing.ClassVar[DataType]  # value = <DataType.HALF: 1>
    INT32: typing.ClassVar[DataType]  # value = <DataType.INT32: 3>
    INT64: typing.ClassVar[DataType]  # value = <DataType.INT64: 8>
    INT8: typing.ClassVar[DataType]  # value = <DataType.INT8: 2>
    UINT8: typing.ClassVar[DataType]  # value = <DataType.UINT8: 5>
    __members__: typing.ClassVar[dict[str, DataType]]  # value = {'FLOAT': <DataType.FLOAT: 0>, 'HALF': <DataType.HALF: 1>, 'INT8': <DataType.INT8: 2>, 'INT32': <DataType.INT32: 3>, 'BOOL': <DataType.BOOL: 4>, 'UINT8': <DataType.UINT8: 5>, 'FP8': <DataType.FP8: 6>, 'BF16': <DataType.BF16: 7>, 'INT64': <DataType.INT64: 8>}
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        ...
    @property
    def value(self) -> int:
        ...
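# Usage sketch (illustrative, not part of the generated stub): DataType
# behaves like a standard pybind11 enum, so values round-trip through int
# and can be rebuilt from their raw value.
def _example_data_type() -> None:
    dt = DataType.HALF
    assert int(dt) == 1 and dt.name == 'HALF'
    assert DataType(7) == DataType.BF16  # construct from the raw value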
class GptJsonConfig:
    @staticmethod
    def parse(json: str) -> GptJsonConfig:
        ...
    @staticmethod
    def parse_file(path: os.PathLike) -> GptJsonConfig:
        ...
    def __init__(self, name: str, version: str, precision: str, tensor_parallelism: int, pipeline_parallelism: int, gpus_per_node: int, model_config: ModelConfig) -> None:
        ...
    @typing.overload
    def engine_filename(self, world_config: WorldConfig, model: str) -> str:
        ...
    @typing.overload
    def engine_filename(self, world_config: WorldConfig) -> str:
        ...
    @property
    def gpus_per_node(self) -> int:
        ...
    @property
    def model_config(self) -> ModelConfig:
        ...
    @property
    def name(self) -> str:
        ...
    @property
    def pipeline_parallelism(self) -> int:
        ...
    @property
    def precision(self) -> str:
        ...
    @property
    def tensor_parallelism(self) -> int:
        ...
    @property
    def version(self) -> str:
        ...
    @property
    def world_size(self) -> int:
        ...
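# Sketch of the typical parse flow (the engine directory path is
# hypothetical): config.json is parsed, then the engine file name for
# this rank is derived from a WorldConfig.
def _example_gpt_json_config() -> None:
    json_config = GptJsonConfig.parse_file('engine_dir/config.json')
    world_config = WorldConfig.mpi(gpus_per_node=json_config.gpus_per_node)
    engine_name = json_config.engine_filename(world_config)
    print(json_config.name, json_config.precision, engine_name)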
class GptManager:
    def __enter__(self) -> typing.Any:
        ...
    def __exit__(self, arg0: typing.Any, arg1: typing.Any, arg2: typing.Any) -> None:
        ...
    def __init__(self, trt_engine_path: os.PathLike, model_type: TrtGptModelType, get_inference_requests_cb: typing.Callable[[int], list[InferenceRequest]], send_response_cb: typing.Callable[[int, list[NamedTensor], bool, str], None], poll_stop_signal_cb: typing.Callable[[], set[int]] = None, return_batch_manager_stats_cb: typing.Callable[[str], None] = None, optional_params: TrtGptModelOptionalParams = ..., terminate_req_id: int | None = None) -> None:
        ...
    def shutdown(self) -> None:
        ...
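# Sketch of driving GptManager through its callback interface. The
# callbacks below are minimal placeholders and the engine path is
# hypothetical; real callbacks would feed requests from a queue and
# forward responses to callers.
def _example_gpt_manager() -> None:
    pending: list[InferenceRequest] = []
    def fetch(max_num: int) -> list[InferenceRequest]:
        batch = pending[:max_num]
        del pending[:max_num]
        return batch
    def respond(request_id: int, tensors: list[NamedTensor],
                is_final: bool, error_msg: str) -> None:
        pass  # deliver output tensors for request_id to its caller
    with GptManager('engine_dir', TrtGptModelType.InflightFusedBatching,
                    fetch, respond) as manager:
        ...  # enqueue InferenceRequests into `pending`; the manager drains them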
class GptModelVariant:
    """
    Members:
    
      GPT
    
      GLM
    
      CHATGLM
    
      MAMBA
    
      RECURRENTGEMMA
    """
    CHATGLM: typing.ClassVar[GptModelVariant]  # value = <GptModelVariant.CHATGLM: 1>
    GLM: typing.ClassVar[GptModelVariant]  # value = <GptModelVariant.GLM: 2>
    GPT: typing.ClassVar[GptModelVariant]  # value = <GptModelVariant.GPT: 0>
    MAMBA: typing.ClassVar[GptModelVariant]  # value = <GptModelVariant.MAMBA: 3>
    RECURRENTGEMMA: typing.ClassVar[GptModelVariant]  # value = <GptModelVariant.RECURRENTGEMMA: 4>
    __members__: typing.ClassVar[dict[str, GptModelVariant]]  # value = {'GPT': <GptModelVariant.GPT: 0>, 'GLM': <GptModelVariant.GLM: 2>, 'CHATGLM': <GptModelVariant.CHATGLM: 1>, 'MAMBA': <GptModelVariant.MAMBA: 3>, 'RECURRENTGEMMA': <GptModelVariant.RECURRENTGEMMA: 4>}
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        ...
    @property
    def value(self) -> int:
        ...
class InferenceRequest:
    bad_words_list: torch.Tensor
    beam_width: torch.Tensor
    draft_input_ids: torch.Tensor
    draft_logits: torch.Tensor
    early_stopping: torch.Tensor
    embedding_bias: torch.Tensor
    end_id: torch.Tensor
    frequency_penalty: torch.Tensor
    input_ids: torch.Tensor
    is_streaming: bool
    length_penalty: torch.Tensor
    lora_config: torch.Tensor
    lora_task_id: torch.Tensor
    lora_weights: torch.Tensor
    max_new_tokens: torch.Tensor
    min_length: torch.Tensor
    no_repeat_ngram_size: torch.Tensor
    pad_id: torch.Tensor
    presence_penalty: torch.Tensor
    prompt_embedding_table: torch.Tensor
    prompt_vocab_size: torch.Tensor
    random_seed: torch.Tensor
    repetition_penalty: torch.Tensor
    return_context_logits: torch.Tensor
    return_generation_logits: torch.Tensor
    return_log_probs: torch.Tensor
    runtime_top_k: torch.Tensor
    runtime_top_p: torch.Tensor
    stop_words_list: torch.Tensor
    temperature: torch.Tensor
    def __getstate__(self) -> bytearray:
        ...
    @typing.overload
    def __init__(self, request_id: int, logits_post_processor_callback: typing.Callable[[int, torch.Tensor, list[list[int]], torch.Stream, int | None], None] | None = None) -> None:
        ...
    @typing.overload
    def __init__(self, arg0: int, arg1: dict[str, torch.Tensor]) -> None:
        """
        deprecated: use direct tensor access instead
        """
    def __setstate__(self, arg0: bytearray) -> None:
        ...
    @property
    def request_id(self) -> int:
        ...
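# Sketch: every InferenceRequest field is a torch.Tensor, including
# scalar settings; the [1, n] and [1, 1] shapes below are assumptions
# about the batch manager's layout, and the token ids are made up.
def _example_inference_request() -> None:
    req = InferenceRequest(request_id=1)
    req.input_ids = torch.tensor([[1, 2, 3, 4]], dtype=torch.int32)
    req.max_new_tokens = torch.tensor([[32]], dtype=torch.int32)
    req.end_id = torch.tensor([[2]], dtype=torch.int32)
    req.is_streaming = False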
class KvCacheConfig:
    __hash__: typing.ClassVar[None] = None
    enable_block_reuse: bool
    free_gpu_memory_fraction: float | None
    max_attention_window: int | None
    max_tokens: int | None
    sink_token_length: int | None
    def __eq__(self, arg0: KvCacheConfig) -> bool:
        ...
    def __getstate__(self) -> tuple:
        ...
    def __init__(self, max_tokens: int | None = None, max_attention_window: int | None = None, sink_token_length: int | None = None, free_gpu_memory_fraction: float | None = None, enable_block_reuse: bool = False) -> None:
        ...
    def __setstate__(self, arg0: tuple) -> None:
        ...
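# Sketch: all KvCacheConfig settings are optional and fall back to
# runtime defaults when left as None; 0.9 is an illustrative fraction,
# not a recommended value.
def _example_kv_cache_config() -> None:
    config = KvCacheConfig(free_gpu_memory_fraction=0.9,
                           enable_block_reuse=True)
    assert config.max_tokens is None  # unset fields stay None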
class LlmRequest:
    context_chunk_size: int
    draft_tokens: list[int]
    end_id: int | None
    is_streaming: bool
    max_new_tokens: int
    max_sent_token_len: int
    pad_id: int | None
    prompt_len: int
    request_id: int
    sampling_config: SamplingConfig
    seq_slot: int | None
    state: LlmRequestState
    def __init__(self, request_id: int, max_new_tokens: int, input_tokens: list[int], sampling_config: SamplingConfig, is_streaming: bool, end_id: int | None = None, pad_id: int | None = None, embedding_bias: torch.Tensor | None = None, bad_words_list: torch.Tensor | None = None, stop_words_list: torch.Tensor | None = None, prompt_embedding_table: torch.Tensor | None = None, prompt_vocab_size: int | None = None, lora_task_id: int | None = None, lora_weights: torch.Tensor | None = None, lora_config: torch.Tensor | None = None, return_log_probs: bool = False, return_context_logits: bool = False, return_generation_logits: bool = False, draft_tokens: list[int] | None = None, draft_logits: torch.Tensor | None = None, exclude_input_from_output: bool = False, logits_post_processor: typing.Callable[[int, torch.Tensor, list[list[int]], torch.Stream, int | None], None] | None = None) -> None:
        ...
    def add_new_token(self, token: int, beam: int) -> None:
        ...
    def add_new_tokens(self, beam_tokens: list[int]) -> None:
        ...
    def get_context_remaining_length(self) -> int:
        ...
    def get_log_probs(self, arg0: int) -> list[float]:
        ...
    def get_num_tokens(self, beam: int) -> int:
        ...
    def get_token(self, beam: int, pos: int) -> int:
        ...
    @typing.overload
    def get_tokens(self, beam: int) -> list[int]:
        ...
    @typing.overload
    def get_tokens(self) -> list[list[int]]:
        ...
    def has_draft_tokens(self) -> bool:
        ...
    def is_first_context_chunk(self) -> bool:
        ...
    def is_full_context_request(self) -> bool:
        ...
    def is_last_context_chunk(self) -> bool:
        ...
    def move_to_next_context_chunk(self) -> None:
        ...
    def pause(self, max_input_len: int) -> None:
        ...
    def set_cum_log_prob(self, cum_log_prob: float, beam: int) -> None:
        ...
    def set_generated_tokens(self, generated_beam_tokens: list[list[int]]) -> None:
        ...
    def set_log_probs(self, log_probs: list[float], beam: int) -> None:
        ...
    @property
    def bad_words_list(self) -> torch.Tensor | None:
        ...
    @property
    def context_current_position(self) -> int:
        ...
    @property
    def cum_log_probs(self) -> list[float]:
        ...
    @property
    def draft_logits(self) -> torch.Tensor | None:
        ...
    @draft_logits.setter
    def draft_logits(self, arg1: torch.Tensor) -> None:
        ...
    @property
    def embedding_bias(self) -> torch.Tensor | None:
        ...
    @property
    def log_probs(self) -> list[list[float]]:
        ...
    @property
    def lora_config(self) -> torch.Tensor | None:
        ...
    @property
    def lora_task_id(self) -> int | None:
        ...
    @property
    def lora_weights(self) -> torch.Tensor | None:
        ...
    @property
    def max_beam_num_tokens(self) -> int:
        ...
    @property
    def max_num_generated_tokens(self) -> int:
        ...
    @property
    def orig_prompt_len(self) -> int:
        ...
    @property
    def prompt_embedding_table(self) -> torch.Tensor | None:
        ...
    @property
    def prompt_vocab_size(self) -> int | None:
        ...
    @property
    def return_context_logits(self) -> bool:
        ...
    @return_context_logits.setter
    def return_context_logits(self, arg1: bool) -> None:
        ...
    @property
    def return_generation_logits(self) -> bool:
        ...
    @return_generation_logits.setter
    def return_generation_logits(self, arg1: bool) -> None:
        ...
    @property
    def return_log_probs(self) -> bool:
        ...
    @property
    def stop_words_list(self) -> torch.Tensor | None:
        ...
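# Sketch: an LlmRequest pairs prompt tokens with a SamplingConfig, and
# generated tokens are appended per beam. Token values are made up, and
# the count below assumes get_num_tokens includes the prompt tokens.
def _example_llm_request() -> None:
    sampling = SamplingConfig(beam_width=1)
    req = LlmRequest(request_id=7, max_new_tokens=16,
                     input_tokens=[1, 2, 3], sampling_config=sampling,
                     is_streaming=False, end_id=2)
    req.add_new_token(42, beam=0)
    assert req.get_num_tokens(beam=0) == 4  # 3 prompt tokens + 1 generated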
class LlmRequestState:
    """
    Members:
    
      REQUEST_STATE_UNKNOWN
    
      REQUEST_STATE_ENCODER_INIT
    
      REQUEST_STATE_CONTEXT_INIT
    
      REQUEST_STATE_GENERATION_IN_PROGRESS
    
      REQUEST_STATE_GENERATION_TO_COMPLETE
    
      REQUEST_STATE_GENERATION_COMPLETE
    """
    REQUEST_STATE_CONTEXT_INIT: typing.ClassVar[LlmRequestState]  # value = <LlmRequestState.REQUEST_STATE_CONTEXT_INIT: 2>
    REQUEST_STATE_ENCODER_INIT: typing.ClassVar[LlmRequestState]  # value = <LlmRequestState.REQUEST_STATE_ENCODER_INIT: 1>
    REQUEST_STATE_GENERATION_COMPLETE: typing.ClassVar[LlmRequestState]  # value = <LlmRequestState.REQUEST_STATE_GENERATION_COMPLETE: 5>
    REQUEST_STATE_GENERATION_IN_PROGRESS: typing.ClassVar[LlmRequestState]  # value = <LlmRequestState.REQUEST_STATE_GENERATION_IN_PROGRESS: 3>
    REQUEST_STATE_GENERATION_TO_COMPLETE: typing.ClassVar[LlmRequestState]  # value = <LlmRequestState.REQUEST_STATE_GENERATION_TO_COMPLETE: 4>
    REQUEST_STATE_UNKNOWN: typing.ClassVar[LlmRequestState]  # value = <LlmRequestState.REQUEST_STATE_UNKNOWN: 0>
    __members__: typing.ClassVar[dict[str, LlmRequestState]]  # value = {'REQUEST_STATE_UNKNOWN': <LlmRequestState.REQUEST_STATE_UNKNOWN: 0>, 'REQUEST_STATE_ENCODER_INIT': <LlmRequestState.REQUEST_STATE_ENCODER_INIT: 1>, 'REQUEST_STATE_CONTEXT_INIT': <LlmRequestState.REQUEST_STATE_CONTEXT_INIT: 2>, 'REQUEST_STATE_GENERATION_IN_PROGRESS': <LlmRequestState.REQUEST_STATE_GENERATION_IN_PROGRESS: 3>, 'REQUEST_STATE_GENERATION_TO_COMPLETE': <LlmRequestState.REQUEST_STATE_GENERATION_TO_COMPLETE: 4>, 'REQUEST_STATE_GENERATION_COMPLETE': <LlmRequestState.REQUEST_STATE_GENERATION_COMPLETE: 5>}
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        ...
    @property
    def value(self) -> int:
        ...
class MemoryCounters:
    @staticmethod
    def instance() -> MemoryCounters:
        ...
    @property
    def cpu(self) -> int:
        ...
    @property
    def gpu(self) -> int:
        ...
    @property
    def pinned(self) -> int:
        ...
    @property
    def uvm(self) -> int:
        ...
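# Sketch: MemoryCounters is a process-wide singleton; the counters are
# presumably byte counts (an assumption based on the int-valued
# properties).
def _example_memory_counters() -> None:
    counters = MemoryCounters.instance()
    print(f"gpu={counters.gpu} cpu={counters.cpu} "
          f"pinned={counters.pinned} uvm={counters.uvm}")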
class ModelConfig:
    compute_context_logits: bool
    compute_generation_logits: bool
    head_size: int
    max_batch_size: int
    max_beam_width: int
    max_input_len: int
    max_num_tokens: int | None
    max_prompt_embedding_table_size: int
    model_variant: GptModelVariant
    num_kv_heads: int
    quant_mode: QuantMode
    tokens_per_block: int
    use_gpt_attention_plugin: bool
    use_packed_input: bool
    use_paged_kv_cache: bool
    def __init__(self, vocab_size: int, num_attention_layers: int, num_rnn_layers: int, num_heads: int, hidden_size: int, data_type: DataType) -> None:
        ...
    def num_attention_layers(self, pipeline_parallelism: int = 1) -> int:
        ...
    def num_rnn_layers(self, pipeline_parallelism: int = 1) -> int:
        ...
    def vocab_size_padded(self, world_size: int) -> int:
        ...
    @property
    def data_type(self) -> DataType:
        ...
    @property
    def hidden_size(self) -> int:
        ...
    @property
    def max_seq_len(self) -> int:
        ...
    @max_seq_len.setter
    def max_seq_len(self, arg1: int) -> None:
        ...
    @property
    def num_heads(self) -> int:
        ...
    @property
    def size_per_head(self) -> int:
        ...
    @property
    def supports_inflight_batching(self) -> bool:
        ...
    @property
    def use_prompt_tuning(self) -> bool:
        ...
    @property
    def vocab_size(self) -> int:
        ...
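# Sketch: ModelConfig is normally obtained from GptJsonConfig.model_config
# rather than built by hand; the constructor values below are illustrative.
def _example_model_config() -> None:
    mc = ModelConfig(vocab_size=32000, num_attention_layers=2,
                     num_rnn_layers=0, num_heads=8, hidden_size=512,
                     data_type=DataType.HALF)
    padded = mc.vocab_size_padded(world_size=2)  # pad vocab across ranks
    assert padded >= mc.vocab_size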
class MpiComm:
    @staticmethod
    def local_init() -> None:
        ...
    @staticmethod
    def local_size() -> int:
        ...
    @staticmethod
    def rank() -> int:
        ...
    @staticmethod
    def size() -> int:
        ...
    @staticmethod
    def split(arg0: int, arg1: int) -> None:
        ...
class NamedTensor:
    tensor: torch.Tensor | None
    def __init__(self, tensor: torch.Tensor | None, name: str) -> None:
        ...
    @property
    def name(self) -> str:
        ...
class PeftCacheManagerConfig:
    device_cache_percent: float | None
    host_cache_size: int | None
    max_adapter_size: int
    max_pages_per_block_device: int
    max_pages_per_block_host: int
    num_copy_streams: int
    num_device_module_layer: int
    num_ensure_workers: int
    num_host_module_layer: int
    num_put_workers: int
    optimal_adapter_size: int
    def __init__(self, num_host_module_layer: int = 0, num_device_module_layer: int = 0, optimal_adapter_size: int = 8, max_adapter_size: int = 64, num_put_workers: int = 1, num_ensure_workers: int = 1, num_copy_streams: int = 1, max_pages_per_block_host: int = 24, max_pages_per_block_device: int = 8, device_cache_percent: float | None = None, host_cache_size: int | None = None) -> None:
        ...
class QuantMode:
    __hash__: typing.ClassVar[None] = None
    @staticmethod
    def activations() -> QuantMode:
        ...
    @staticmethod
    def fp8_kv_cache() -> QuantMode:
        ...
    @staticmethod
    def fp8_qdq() -> QuantMode:
        ...
    @staticmethod
    def from_description(quantize_weights: bool = False, quantize_activations: bool = False, per_token: bool = False, per_channel: bool = False, per_group: bool = False, use_int4_weights: bool = False, use_int8_kv_cache: bool = False, use_fp8_kv_cache: bool = False, use_fp8_qdq: bool = False, use_fp8_rowwise: bool = False) -> QuantMode:
        ...
    @staticmethod
    def from_quant_algo(quant_algo: str | None = None, kv_cache_quant_algo: str | None = None) -> QuantMode:
        ...
    @staticmethod
    def int4_weights() -> QuantMode:
        ...
    @staticmethod
    def int8_kv_cache() -> QuantMode:
        ...
    @staticmethod
    def int8_weights() -> QuantMode:
        ...
    @staticmethod
    def none() -> QuantMode:
        ...
    @staticmethod
    def per_channel_scaling() -> QuantMode:
        ...
    @staticmethod
    def per_group_scaling() -> QuantMode:
        ...
    @staticmethod
    def per_token_scaling() -> QuantMode:
        ...
    @staticmethod
    def use_smooth_quant(per_token: bool = False, per_channel: bool = False) -> QuantMode:
        ...
    @staticmethod
    def use_weight_only(use_int4_weights: bool = False, per_group: bool = False) -> QuantMode:
        ...
    def __add__(self, arg0: QuantMode) -> QuantMode:
        ...
    def __eq__(self, arg0: QuantMode) -> bool:
        ...
    def __iadd__(self, arg0: QuantMode) -> QuantMode:
        ...
    def __isub__(self, arg0: QuantMode) -> QuantMode:
        ...
    def __ne__(self, arg0: QuantMode) -> bool:
        ...
    def __sub__(self, arg0: QuantMode) -> QuantMode:
        ...
    def is_set(self, mode: QuantMode) -> bool:
        ...
    @property
    def has_activations(self) -> bool:
        ...
    @property
    def has_fp8_kv_cache(self) -> bool:
        ...
    @property
    def has_fp8_qdq(self) -> bool:
        ...
    @property
    def has_int4_weights(self) -> bool:
        ...
    @property
    def has_int8_kv_cache(self) -> bool:
        ...
    @property
    def has_int8_weights(self) -> bool:
        ...
    @property
    def has_kv_cache_quant(self) -> bool:
        ...
    @property
    def has_per_channel_scaling(self) -> bool:
        ...
    @property
    def has_per_group_scaling(self) -> bool:
        ...
    @property
    def has_per_token_scaling(self) -> bool:
        ...
    @property
    def has_static_activation_scaling(self) -> bool:
        ...
    @property
    def value(self) -> int:
        ...
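# Sketch: QuantMode values are bit flags that compose with + and - and
# are queried through the has_* properties.
def _example_quant_mode() -> None:
    mode = QuantMode.use_weight_only(use_int4_weights=True) + QuantMode.int8_kv_cache()
    assert mode.has_int4_weights and mode.has_int8_kv_cache
    assert not mode.has_fp8_qdq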
class SamplingConfig:
    __hash__: typing.ClassVar[None] = None
    beam_search_diversity_rate: list[float] | None
    beam_width: int
    early_stopping: list[int] | None
    frequency_penalty: list[float] | None
    length_penalty: list[float] | None
    min_length: list[int] | None
    no_repeat_ngram_size: list[int] | None
    presence_penalty: list[float] | None
    random_seed: list[int] | None
    repetition_penalty: list[float] | None
    temperature: list[float] | None
    top_k: list[int] | None
    top_p: list[float] | None
    top_p_decay: list[float] | None
    top_p_min: list[float] | None
    top_p_reset_ids: list[int] | None
    def __eq__(self, arg0: SamplingConfig) -> bool:
        ...
    def __getstate__(self) -> tuple:
        ...
    def __init__(self, beam_width: int = 1) -> None:
        ...
    def __setstate__(self, arg0: tuple) -> None:
        ...
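# Sketch: sampling parameters are optional lists, presumably one entry
# per sequence or a single broadcast value (an assumption here); the
# values below are illustrative.
def _example_sampling_config() -> None:
    sampling = SamplingConfig(beam_width=1)
    sampling.temperature = [0.8]
    sampling.top_k = [40]
    sampling.top_p = [0.95]
    sampling.random_seed = [1234]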
class TrtGptModelOptionalParams:
    __hash__: typing.ClassVar[None] = None
    decoding_config: executor.DecodingConfig
    device_ids: list[int] | None
    enable_chunked_context: bool
    enable_trt_overlap: bool
    gpu_weights_percent: float
    kv_cache_config: KvCacheConfig
    max_beam_width: int | None
    normalize_log_probs: bool
    scheduler_config: executor.SchedulerConfig
    def __eq__(self, arg0: TrtGptModelOptionalParams) -> bool:
        ...
    def __getstate__(self) -> tuple:
        ...
    def __init__(self, kv_cache_config: KvCacheConfig = ..., enable_trt_overlap: bool = False, device_ids: list[int] | None = None, normalize_log_probs: bool = True, enable_chunked_context: bool = False, peft_cache_manager_config: PeftCacheManagerConfig = ...) -> None:
        ...
    def __setstate__(self, arg0: tuple) -> None:
        ...
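# Sketch: TrtGptModelOptionalParams bundles KV-cache settings with
# runtime toggles for GptManager; the fraction below is illustrative.
def _example_optional_params() -> None:
    params = TrtGptModelOptionalParams(
        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.85),
        enable_chunked_context=True)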
class TrtGptModelType:
    """
    Members:
    
      V1
    
      InflightBatching
    
      InflightFusedBatching
    """
    InflightBatching: typing.ClassVar[TrtGptModelType]  # value = <TrtGptModelType.InflightBatching: 1>
    InflightFusedBatching: typing.ClassVar[TrtGptModelType]  # value = <TrtGptModelType.InflightFusedBatching: 2>
    V1: typing.ClassVar[TrtGptModelType]  # value = <TrtGptModelType.V1: 0>
    __members__: typing.ClassVar[dict[str, TrtGptModelType]]  # value = {'V1': <TrtGptModelType.V1: 0>, 'InflightBatching': <TrtGptModelType.InflightBatching: 1>, 'InflightFusedBatching': <TrtGptModelType.InflightFusedBatching: 2>}
    def __eq__(self, other: typing.Any) -> bool:
        ...
    def __getstate__(self) -> int:
        ...
    def __hash__(self) -> int:
        ...
    def __index__(self) -> int:
        ...
    def __init__(self, value: int) -> None:
        ...
    def __int__(self) -> int:
        ...
    def __ne__(self, other: typing.Any) -> bool:
        ...
    def __repr__(self) -> str:
        ...
    def __setstate__(self, state: int) -> None:
        ...
    def __str__(self) -> str:
        ...
    @property
    def name(self) -> str:
        ...
    @property
    def value(self) -> int:
        ...
class WorldConfig:
    @staticmethod
    def mpi(gpus_per_node: int = 8, tensor_parallelism: int | None = None, pipeline_parallelism: int | None = None, device_ids: list[int] | None = None) -> WorldConfig:
        ...
    def __init__(self, tensor_parallelism: int = 1, pipeline_parallelism: int = 1, rank: int = 0, gpus_per_node: int = 8, device_ids: list[int] | None = None) -> None:
        ...
    @property
    def device(self) -> int:
        ...
    @property
    def gpus_per_group(self) -> int:
        ...
    @property
    def gpus_per_node(self) -> int:
        ...
    @property
    def is_pipeline_parallel(self) -> bool:
        ...
    @property
    def is_tensor_parallel(self) -> bool:
        ...
    @property
    def local_rank(self) -> int:
        ...
    @property
    def node_rank(self) -> int:
        ...
    @property
    def pipeline_parallel_rank(self) -> int:
        ...
    @property
    def pipeline_parallelism(self) -> int:
        ...
    @property
    def rank(self) -> int:
        ...
    @property
    def size(self) -> int:
        ...
    @property
    def tensor_parallel_rank(self) -> int:
        ...
    @property
    def tensor_parallelism(self) -> int:
        ...
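# Sketch: WorldConfig places this rank in the tensor/pipeline parallel
# grid; size equals tensor_parallelism * pipeline_parallelism.
def _example_world_config() -> None:
    wc = WorldConfig(tensor_parallelism=2, pipeline_parallelism=1, rank=0)
    assert wc.size == 2
    assert wc.is_tensor_parallel and not wc.is_pipeline_parallel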
BF16: DataType  # value = <DataType.BF16: 7>
BOOL: DataType  # value = <DataType.BOOL: 4>
FLOAT: DataType  # value = <DataType.FLOAT: 0>
FP8: DataType  # value = <DataType.FP8: 6>
HALF: DataType  # value = <DataType.HALF: 1>
INT32: DataType  # value = <DataType.INT32: 3>
INT64: DataType  # value = <DataType.INT64: 8>
INT8: DataType  # value = <DataType.INT8: 2>
UINT8: DataType  # value = <DataType.UINT8: 5>