"""
TensorRT-LLM Python bindings for C++ runtime
"""
from __future__ import annotations
import os
import torch
import typing
from . import BuildInfo
from . import executor
from . import tensor_names
__all__ = [
    'BF16', 'BOOL', 'BuildInfo', 'DataType', 'FLOAT', 'FP8', 'GptJsonConfig',
    'GptManager', 'GptModelVariant', 'HALF', 'INT32', 'INT64', 'INT8',
    'InferenceRequest', 'KvCacheConfig', 'LlmRequest', 'LlmRequestState',
    'MemoryCounters', 'ModelConfig', 'MpiComm', 'NamedTensor',
    'PeftCacheManagerConfig', 'QuantMode', 'SamplingConfig',
    'TrtGptModelOptionalParams', 'TrtGptModelType', 'UINT8', 'WorldConfig',
    'executor', 'tensor_names']
class DataType:
"""
Members:
FLOAT
HALF
INT8
INT32
BOOL
UINT8
FP8
BF16
INT64
"""
BF16: typing.ClassVar[DataType] # value = <DataType.BF16: 7>
BOOL: typing.ClassVar[DataType] # value = <DataType.BOOL: 4>
FLOAT: typing.ClassVar[DataType] # value = <DataType.FLOAT: 0>
FP8: typing.ClassVar[DataType] # value = <DataType.FP8: 6>
HALF: typing.ClassVar[DataType] # value = <DataType.HALF: 1>
INT32: typing.ClassVar[DataType] # value = <DataType.INT32: 3>
INT64: typing.ClassVar[DataType] # value = <DataType.INT64: 8>
INT8: typing.ClassVar[DataType] # value = <DataType.INT8: 2>
UINT8: typing.ClassVar[DataType] # value = <DataType.UINT8: 5>
__members__: typing.ClassVar[dict[str, DataType]] # value = {'FLOAT': <DataType.FLOAT: 0>, 'HALF': <DataType.HALF: 1>, 'INT8': <DataType.INT8: 2>, 'INT32': <DataType.INT32: 3>, 'BOOL': <DataType.BOOL: 4>, 'UINT8': <DataType.UINT8: 5>, 'FP8': <DataType.FP8: 6>, 'BF16': <DataType.BF16: 7>, 'INT64': <DataType.INT64: 8>}
def __eq__(self, other: typing.Any) -> bool:
...
def __getstate__(self) -> int:
...
def __hash__(self) -> int:
...
def __index__(self) -> int:
...
def __init__(self, value: int) -> None:
...
def __int__(self) -> int:
...
def __ne__(self, other: typing.Any) -> bool:
...
def __repr__(self) -> str:
...
def __setstate__(self, state: int) -> None:
...
def __str__(self) -> str:
...
@property
def name(self) -> str:
...
@property
def value(self) -> int:
...
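# Usage sketch: DataType is a pybind11-style enum, so members compare, hash,
# and convert to int as shown. The torch dtype mapping is an illustrative
# assumption, not something these bindings provide.
#
#     dt = DataType.HALF
#     assert int(dt) == 1 and dt.name == 'HALF'
#     torch_dtype = {DataType.FLOAT: torch.float32,
#                    DataType.HALF: torch.float16,
#                    DataType.BF16: torch.bfloat16}[dt]  # hypothetical mapping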
class GptJsonConfig:
@staticmethod
def parse(json: str) -> GptJsonConfig:
...
@staticmethod
def parse_file(path: os.PathLike) -> GptJsonConfig:
...
def __init__(self, name: str, version: str, precision: str, tensor_parallelism: int, pipeline_parallelism: int, gpus_per_node: int, model_config: ModelConfig) -> None:
...
@typing.overload
def engine_filename(self, world_config: WorldConfig, model: str) -> str:
...
@typing.overload
def engine_filename(self, world_config: WorldConfig) -> str:
...
@property
def gpus_per_node(self) -> int:
...
@property
def model_config(self) -> ModelConfig:
...
@property
def name(self) -> str:
...
@property
def pipeline_parallelism(self) -> int:
...
@property
def precision(self) -> str:
...
@property
def tensor_parallelism(self) -> int:
...
@property
def version(self) -> str:
...
@property
def world_size(self) -> int:
...
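# Usage sketch (assumed flow): parse an engine directory's config.json, build a
# matching WorldConfig, and resolve this rank's engine filename. The path is
# illustrative.
#
#     config = GptJsonConfig.parse_file('engine_dir/config.json')
#     world = WorldConfig.mpi(gpus_per_node=config.gpus_per_node,
#                             tensor_parallelism=config.tensor_parallelism,
#                             pipeline_parallelism=config.pipeline_parallelism)
#     engine_path = 'engine_dir/' + config.engine_filename(world)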
class GptManager:
def __enter__(self) -> typing.Any:
...
def __exit__(self, arg0: typing.Any, arg1: typing.Any, arg2: typing.Any) -> None:
...
    def __init__(self, trt_engine_path: os.PathLike, model_type: TrtGptModelType,
                 get_inference_requests_cb: typing.Callable[[int], list[InferenceRequest]],
                 send_response_cb: typing.Callable[[int, list[NamedTensor], bool, str], None],
                 poll_stop_signal_cb: typing.Callable[[], set[int]] | None = None,
                 return_batch_manager_stats_cb: typing.Callable[[str], None] | None = None,
                 optional_params: TrtGptModelOptionalParams = ...,
                 terminate_req_id: int | None = None) -> None:
...
def shutdown(self) -> None:
...
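# Usage sketch: GptManager pulls work and pushes results through callbacks
# whose signatures are given in __init__ above. The queueing shown here is a
# minimal illustration, not the real serving loop; `pending` is hypothetical.
#
#     pending: list[InferenceRequest] = []
#
#     def fetch(max_num: int) -> list[InferenceRequest]:
#         batch, pending[:] = pending[:max_num], pending[max_num:]
#         return batch
#
#     def respond(req_id: int, tensors: list[NamedTensor],
#                 is_final: bool, err_msg: str) -> None:
#         ...  # deliver outputs for req_id to the caller
#
#     with GptManager('engine_dir', TrtGptModelType.InflightFusedBatching,
#                     fetch, respond):
#         ...  # enqueue InferenceRequests onto `pending`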
class GptModelVariant:
"""
Members:
GPT
GLM
CHATGLM
MAMBA
RECURRENTGEMMA
"""
CHATGLM: typing.ClassVar[GptModelVariant] # value = <GptModelVariant.CHATGLM: 1>
GLM: typing.ClassVar[GptModelVariant] # value = <GptModelVariant.GLM: 2>
GPT: typing.ClassVar[GptModelVariant] # value = <GptModelVariant.GPT: 0>
MAMBA: typing.ClassVar[GptModelVariant] # value = <GptModelVariant.MAMBA: 3>
RECURRENTGEMMA: typing.ClassVar[GptModelVariant] # value = <GptModelVariant.RECURRENTGEMMA: 4>
__members__: typing.ClassVar[dict[str, GptModelVariant]] # value = {'GPT': <GptModelVariant.GPT: 0>, 'GLM': <GptModelVariant.GLM: 2>, 'CHATGLM': <GptModelVariant.CHATGLM: 1>, 'MAMBA': <GptModelVariant.MAMBA: 3>, 'RECURRENTGEMMA': <GptModelVariant.RECURRENTGEMMA: 4>}
def __eq__(self, other: typing.Any) -> bool:
...
def __getstate__(self) -> int:
...
def __hash__(self) -> int:
...
def __index__(self) -> int:
...
def __init__(self, value: int) -> None:
...
def __int__(self) -> int:
...
def __ne__(self, other: typing.Any) -> bool:
...
def __repr__(self) -> str:
...
def __setstate__(self, state: int) -> None:
...
def __str__(self) -> str:
...
@property
def name(self) -> str:
...
@property
def value(self) -> int:
...
class InferenceRequest:
bad_words_list: torch.Tensor
beam_width: torch.Tensor
draft_input_ids: torch.Tensor
draft_logits: torch.Tensor
early_stopping: torch.Tensor
embedding_bias: torch.Tensor
end_id: torch.Tensor
frequency_penalty: torch.Tensor
input_ids: torch.Tensor
is_streaming: bool
length_penalty: torch.Tensor
lora_config: torch.Tensor
lora_task_id: torch.Tensor
lora_weights: torch.Tensor
max_new_tokens: torch.Tensor
min_length: torch.Tensor
no_repeat_ngram_size: torch.Tensor
pad_id: torch.Tensor
presence_penalty: torch.Tensor
prompt_embedding_table: torch.Tensor
prompt_vocab_size: torch.Tensor
random_seed: torch.Tensor
repetition_penalty: torch.Tensor
return_context_logits: torch.Tensor
return_generation_logits: torch.Tensor
return_log_probs: torch.Tensor
runtime_top_k: torch.Tensor
runtime_top_p: torch.Tensor
stop_words_list: torch.Tensor
temperature: torch.Tensor
def __getstate__(self) -> bytearray:
...
@typing.overload
def __init__(self, request_id: int, logits_post_processor_callback: typing.Callable[[int, torch.Tensor, list[list[int]], torch.Stream, int | None], None] | None = None) -> None:
...
@typing.overload
def __init__(self, arg0: int, arg1: dict[str, torch.Tensor]) -> None:
"""
deprecated: use direct tensor access instead
"""
def __setstate__(self, arg0: bytearray) -> None:
...
@property
def request_id(self) -> int:
...
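# Usage sketch: scalar knobs are passed as small torch tensors, matching the
# annotations above; the (1, n) shapes and values are illustrative assumptions.
#
#     req = InferenceRequest(request_id=1)
#     req.input_ids = torch.tensor([[1, 2, 3]], dtype=torch.int32)
#     req.max_new_tokens = torch.tensor([[64]], dtype=torch.int32)
#     req.is_streaming = False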
class KvCacheConfig:
__hash__: typing.ClassVar[None] = None
enable_block_reuse: bool
free_gpu_memory_fraction: float | None
max_attention_window: int | None
max_tokens: int | None
sink_token_length: int | None
def __eq__(self, arg0: KvCacheConfig) -> bool:
...
def __getstate__(self) -> tuple:
...
def __init__(self, max_tokens: int | None = None, max_attention_window: int | None = None, sink_token_length: int | None = None, free_gpu_memory_fraction: float | None = None, enable_block_reuse: bool = False) -> None:
...
def __setstate__(self, arg0: tuple) -> None:
...
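# Usage sketch: cap the KV cache at 90% of free GPU memory and enable block
# reuse; the fraction is an illustrative choice.
#
#     kv_cfg = KvCacheConfig(free_gpu_memory_fraction=0.9,
#                            enable_block_reuse=True)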
class LlmRequest:
context_chunk_size: int
draft_tokens: list[int]
end_id: int | None
is_streaming: bool
max_new_tokens: int
max_sent_token_len: int
pad_id: int | None
prompt_len: int
request_id: int
sampling_config: SamplingConfig
seq_slot: int | None
state: LlmRequestState
    def __init__(self, request_id: int, max_new_tokens: int, input_tokens: list[int],
                 sampling_config: SamplingConfig, is_streaming: bool,
                 end_id: int | None = None, pad_id: int | None = None,
                 embedding_bias: torch.Tensor | None = None,
                 bad_words_list: torch.Tensor | None = None,
                 stop_words_list: torch.Tensor | None = None,
                 prompt_embedding_table: torch.Tensor | None = None,
                 prompt_vocab_size: int | None = None,
                 lora_task_id: int | None = None,
                 lora_weights: torch.Tensor | None = None,
                 lora_config: torch.Tensor | None = None,
                 return_log_probs: bool = False,
                 return_context_logits: bool = False,
                 return_generation_logits: bool = False,
                 draft_tokens: list[int] | None = None,
                 draft_logits: torch.Tensor | None = None,
                 exclude_input_from_output: bool = False,
                 logits_post_processor: typing.Callable[[int, torch.Tensor, list[list[int]], torch.Stream, int | None], None] | None = None) -> None:
...
def add_new_token(self, token: int, beam: int) -> None:
...
def add_new_tokens(self, beam_tokens: list[int]) -> None:
...
def get_context_remaining_length(self) -> int:
...
def get_log_probs(self, arg0: int) -> list[float]:
...
def get_num_tokens(self, beam: int) -> int:
...
def get_token(self, beam: int, pos: int) -> int:
...
@typing.overload
def get_tokens(self, beam: int) -> list[int]:
...
@typing.overload
def get_tokens(self) -> list[list[int]]:
...
def has_draft_tokens(self) -> bool:
...
def is_first_context_chunk(self) -> bool:
...
def is_full_context_request(self) -> bool:
...
def is_last_context_chunk(self) -> bool:
...
def move_to_next_context_chunk(self) -> None:
...
def pause(self, max_input_len: int) -> None:
...
def set_cum_log_prob(self, cum_log_prob: float, beam: int) -> None:
...
def set_generated_tokens(self, generated_beam_tokens: list[list[int]]) -> None:
...
def set_log_probs(self, log_probs: list[float], beam: int) -> None:
...
@property
def bad_words_list(self) -> torch.Tensor | None:
...
@property
def context_current_position(self) -> int:
...
@property
def cum_log_probs(self) -> list[float]:
...
@property
def draft_logits(self) -> torch.Tensor | None:
...
@draft_logits.setter
def draft_logits(self, arg1: torch.Tensor) -> None:
...
@property
def embedding_bias(self) -> torch.Tensor | None:
...
@property
def log_probs(self) -> list[list[float]]:
...
@property
def lora_config(self) -> torch.Tensor | None:
...
@property
def lora_task_id(self) -> int | None:
...
@property
def lora_weights(self) -> torch.Tensor | None:
...
@property
def max_beam_num_tokens(self) -> int:
...
@property
def max_num_generated_tokens(self) -> int:
...
@property
def orig_prompt_len(self) -> int:
...
@property
def prompt_embedding_table(self) -> torch.Tensor | None:
...
@property
def prompt_vocab_size(self) -> int | None:
...
@property
    def return_context_logits(self) -> bool:
        ...
    @return_context_logits.setter
    def return_context_logits(self, arg1: bool) -> None:
        ...
    @property
    def return_generation_logits(self) -> bool:
        ...
    @return_generation_logits.setter
    def return_generation_logits(self, arg1: bool) -> None:
        ...
@property
def return_log_probs(self) -> bool:
...
@property
def stop_words_list(self) -> torch.Tensor | None:
...
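# Usage sketch: build a request from plain token ids and append one generated
# token on beam 0. Token values are illustrative, and the final assertion
# assumes get_num_tokens counts prompt plus generated tokens.
#
#     sampling = SamplingConfig(beam_width=1)
#     req = LlmRequest(request_id=0, max_new_tokens=16,
#                      input_tokens=[1, 2, 3], sampling_config=sampling,
#                      is_streaming=False)
#     req.add_new_token(42, beam=0)
#     assert req.get_num_tokens(beam=0) == 4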
class LlmRequestState:
"""
Members:
REQUEST_STATE_UNKNOWN
REQUEST_STATE_ENCODER_INIT
REQUEST_STATE_CONTEXT_INIT
REQUEST_STATE_GENERATION_IN_PROGRESS
REQUEST_STATE_GENERATION_TO_COMPLETE
REQUEST_STATE_GENERATION_COMPLETE
"""
REQUEST_STATE_CONTEXT_INIT: typing.ClassVar[LlmRequestState] # value = <LlmRequestState.REQUEST_STATE_CONTEXT_INIT: 2>
REQUEST_STATE_ENCODER_INIT: typing.ClassVar[LlmRequestState] # value = <LlmRequestState.REQUEST_STATE_ENCODER_INIT: 1>
REQUEST_STATE_GENERATION_COMPLETE: typing.ClassVar[LlmRequestState] # value = <LlmRequestState.REQUEST_STATE_GENERATION_COMPLETE: 5>
REQUEST_STATE_GENERATION_IN_PROGRESS: typing.ClassVar[LlmRequestState] # value = <LlmRequestState.REQUEST_STATE_GENERATION_IN_PROGRESS: 3>
REQUEST_STATE_GENERATION_TO_COMPLETE: typing.ClassVar[LlmRequestState] # value = <LlmRequestState.REQUEST_STATE_GENERATION_TO_COMPLETE: 4>
REQUEST_STATE_UNKNOWN: typing.ClassVar[LlmRequestState] # value = <LlmRequestState.REQUEST_STATE_UNKNOWN: 0>
__members__: typing.ClassVar[dict[str, LlmRequestState]] # value = {'REQUEST_STATE_UNKNOWN': <LlmRequestState.REQUEST_STATE_UNKNOWN: 0>, 'REQUEST_STATE_ENCODER_INIT': <LlmRequestState.REQUEST_STATE_ENCODER_INIT: 1>, 'REQUEST_STATE_CONTEXT_INIT': <LlmRequestState.REQUEST_STATE_CONTEXT_INIT: 2>, 'REQUEST_STATE_GENERATION_IN_PROGRESS': <LlmRequestState.REQUEST_STATE_GENERATION_IN_PROGRESS: 3>, 'REQUEST_STATE_GENERATION_TO_COMPLETE': <LlmRequestState.REQUEST_STATE_GENERATION_TO_COMPLETE: 4>, 'REQUEST_STATE_GENERATION_COMPLETE': <LlmRequestState.REQUEST_STATE_GENERATION_COMPLETE: 5>}
def __eq__(self, other: typing.Any) -> bool:
...
def __getstate__(self) -> int:
...
def __hash__(self) -> int:
...
def __index__(self) -> int:
...
def __init__(self, value: int) -> None:
...
def __int__(self) -> int:
...
def __ne__(self, other: typing.Any) -> bool:
...
def __repr__(self) -> str:
...
def __setstate__(self, state: int) -> None:
...
def __str__(self) -> str:
...
@property
def name(self) -> str:
...
@property
def value(self) -> int:
...
class MemoryCounters:
@staticmethod
def instance() -> MemoryCounters:
...
@property
def cpu(self) -> int:
...
@property
def gpu(self) -> int:
...
@property
def pinned(self) -> int:
...
@property
def uvm(self) -> int:
...
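# Usage sketch: MemoryCounters is a process-wide singleton; the counters are
# assumed to be byte counts per memory pool.
#
#     counters = MemoryCounters.instance()
#     print(f'gpu={counters.gpu} cpu={counters.cpu} '
#           f'pinned={counters.pinned} uvm={counters.uvm}')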
class ModelConfig:
compute_context_logits: bool
compute_generation_logits: bool
head_size: int
max_batch_size: int
max_beam_width: int
max_input_len: int
max_num_tokens: int | None
max_prompt_embedding_table_size: int
model_variant: GptModelVariant
num_kv_heads: int
quant_mode: QuantMode
tokens_per_block: int
use_gpt_attention_plugin: bool
use_packed_input: bool
use_paged_kv_cache: bool
def __init__(self, vocab_size: int, num_attention_layers: int, num_rnn_layers: int, num_heads: int, hidden_size: int, data_type: DataType) -> None:
...
def num_attention_layers(self, pipeline_parallelism: int = 1) -> int:
...
def num_rnn_layers(self, pipeline_parallelism: int = 1) -> int:
...
def vocab_size_padded(self, world_size: int) -> int:
...
@property
def data_type(self) -> DataType:
...
@property
def hidden_size(self) -> int:
...
@property
def max_seq_len(self) -> int:
...
@max_seq_len.setter
    def max_seq_len(self, arg1: int) -> None:
...
@property
def num_heads(self) -> int:
...
@property
def size_per_head(self) -> int:
...
@property
def supports_inflight_batching(self) -> bool:
...
@property
def use_prompt_tuning(self) -> bool:
...
@property
def vocab_size(self) -> int:
...
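# Usage sketch: construct a ModelConfig and query derived quantities; the
# dimensions are illustrative, not taken from any real engine.
#
#     mc = ModelConfig(vocab_size=32000, num_attention_layers=32,
#                      num_rnn_layers=0, num_heads=32, hidden_size=4096,
#                      data_type=DataType.HALF)
#     layers_per_rank = mc.num_attention_layers(pipeline_parallelism=2)
#     padded_vocab = mc.vocab_size_padded(world_size=2)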
class MpiComm:
@staticmethod
def local_init() -> None:
...
@staticmethod
def local_size() -> int:
...
@staticmethod
def rank() -> int:
...
@staticmethod
def size() -> int:
...
@staticmethod
def split(arg0: int, arg1: int) -> None:
...
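# Usage sketch: static helpers over the process-wide MPI communicator,
# mirroring the usual rank/size semantics.
#
#     if MpiComm.size() > 1:
#         print(f'rank {MpiComm.rank()} of {MpiComm.size()}')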
class NamedTensor:
tensor: torch.Tensor | None
def __init__(self, tensor: torch.Tensor | None, name: str) -> None:
...
@property
def name(self) -> str:
...
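# Usage sketch: wrap an output tensor under the name the batch manager expects;
# 'output_ids' is an assumed name (see the tensor_names submodule).
#
#     out = NamedTensor(torch.zeros(1, 8, dtype=torch.int32), 'output_ids')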
class PeftCacheManagerConfig:
device_cache_percent: float | None
host_cache_size: int | None
max_adapter_size: int
max_pages_per_block_device: int
max_pages_per_block_host: int
num_copy_streams: int
num_device_module_layer: int
num_ensure_workers: int
num_host_module_layer: int
num_put_workers: int
optimal_adapter_size: int
    def __init__(self, num_host_module_layer: int = 0, num_device_module_layer: int = 0,
                 optimal_adapter_size: int = 8, max_adapter_size: int = 64,
                 num_put_workers: int = 1, num_ensure_workers: int = 1,
                 num_copy_streams: int = 1, max_pages_per_block_host: int = 24,
                 max_pages_per_block_device: int = 8,
                 device_cache_percent: float | None = None,
                 host_cache_size: int | None = None) -> None:
...
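# Usage sketch: size the PEFT (LoRA) caches; the overrides below are
# illustrative, with all other fields keeping the defaults from __init__.
#
#     peft_cfg = PeftCacheManagerConfig(device_cache_percent=0.02,
#                                       host_cache_size=1 << 30)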
class QuantMode:
__hash__: typing.ClassVar[None] = None
@staticmethod
def activations() -> QuantMode:
...
@staticmethod
def fp8_kv_cache() -> QuantMode:
...
@staticmethod
def fp8_qdq() -> QuantMode:
...
@staticmethod
    def from_description(quantize_weights: bool = False, quantize_activations: bool = False,
                         per_token: bool = False, per_channel: bool = False,
                         per_group: bool = False, use_int4_weights: bool = False,
                         use_int8_kv_cache: bool = False, use_fp8_kv_cache: bool = False,
                         use_fp8_qdq: bool = False, use_fp8_rowwise: bool = False) -> QuantMode:
...
@staticmethod
def from_quant_algo(quant_algo: str | None = None, kv_cache_quant_algo: str | None = None) -> QuantMode:
...
@staticmethod
def int4_weights() -> QuantMode:
...
@staticmethod
def int8_kv_cache() -> QuantMode:
...
@staticmethod
def int8_weights() -> QuantMode:
...
@staticmethod
def none() -> QuantMode:
...
@staticmethod
def per_channel_scaling() -> QuantMode:
...
@staticmethod
def per_group_scaling() -> QuantMode:
...
@staticmethod
def per_token_scaling() -> QuantMode:
...
@staticmethod
def use_smooth_quant(per_token: bool = False, per_channel: bool = False) -> QuantMode:
...
@staticmethod
def use_weight_only(use_int4_weights: bool = False, per_group: bool = False) -> QuantMode:
...
def __add__(self, arg0: QuantMode) -> QuantMode:
...
def __eq__(self, arg0: QuantMode) -> bool:
...
def __iadd__(self, arg0: QuantMode) -> QuantMode:
...
def __isub__(self, arg0: QuantMode) -> QuantMode:
...
def __ne__(self, arg0: QuantMode) -> bool:
...
def __sub__(self, arg0: QuantMode) -> QuantMode:
...
def is_set(self, mode: QuantMode) -> bool:
...
@property
def has_activations(self) -> bool:
...
@property
def has_fp8_kv_cache(self) -> bool:
...
@property
def has_fp8_qdq(self) -> bool:
...
@property
def has_int4_weights(self) -> bool:
...
@property
def has_int8_kv_cache(self) -> bool:
...
@property
def has_int8_weights(self) -> bool:
...
@property
def has_kv_cache_quant(self) -> bool:
...
@property
def has_per_channel_scaling(self) -> bool:
...
@property
def has_per_group_scaling(self) -> bool:
...
@property
def has_per_token_scaling(self) -> bool:
...
@property
def has_static_activation_scaling(self) -> bool:
...
@property
def value(self) -> int:
...
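# Usage sketch: QuantMode values act as composable bit flags; `+` adds flags
# and the has_* properties inspect them.
#
#     mode = QuantMode.use_smooth_quant(per_token=True, per_channel=True)
#     mode += QuantMode.int8_kv_cache()
#     assert mode.has_int8_kv_cache and mode.has_per_token_scaling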
class SamplingConfig:
__hash__: typing.ClassVar[None] = None
beam_search_diversity_rate: list[float] | None
beam_width: int
early_stopping: list[int] | None
frequency_penalty: list[float] | None
length_penalty: list[float] | None
min_length: list[int] | None
no_repeat_ngram_size: list[int] | None
presence_penalty: list[float] | None
random_seed: list[int] | None
repetition_penalty: list[float] | None
temperature: list[float] | None
top_k: list[int] | None
top_p: list[float] | None
top_p_decay: list[float] | None
top_p_min: list[float] | None
top_p_reset_ids: list[int] | None
def __eq__(self, arg0: SamplingConfig) -> bool:
...
def __getstate__(self) -> tuple:
...
def __init__(self, beam_width: int = 1) -> None:
...
def __setstate__(self, arg0: tuple) -> None:
...
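# Usage sketch: list-typed fields are assumed to hold one value per sequence;
# the sampling values below are illustrative.
#
#     sampling = SamplingConfig(beam_width=1)
#     sampling.temperature = [0.8]
#     sampling.top_k = [40]
#     sampling.top_p = [0.95]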
class TrtGptModelOptionalParams:
__hash__: typing.ClassVar[None] = None
decoding_config: executor.DecodingConfig
device_ids: list[int] | None
enable_chunked_context: bool
enable_trt_overlap: bool
gpu_weights_percent: float
kv_cache_config: KvCacheConfig
max_beam_width: int | None
normalize_log_probs: bool
scheduler_config: executor.SchedulerConfig
def __eq__(self, arg0: TrtGptModelOptionalParams) -> bool:
...
def __getstate__(self) -> tuple:
...
    def __init__(self, kv_cache_config: KvCacheConfig = ..., enable_trt_overlap: bool = False,
                 device_ids: list[int] | None = None, normalize_log_probs: bool = True,
                 enable_chunked_context: bool = False,
                 peft_cache_manager_config: PeftCacheManagerConfig = ...) -> None:
...
def __setstate__(self, arg0: tuple) -> None:
...
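# Usage sketch: bundle optional runtime settings for GptManager, overriding
# only what differs from the defaults; values are illustrative.
#
#     params = TrtGptModelOptionalParams(
#         kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.9),
#         enable_chunked_context=True)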
class TrtGptModelType:
"""
Members:
V1
InflightBatching
InflightFusedBatching
"""
InflightBatching: typing.ClassVar[TrtGptModelType] # value = <TrtGptModelType.InflightBatching: 1>
InflightFusedBatching: typing.ClassVar[TrtGptModelType] # value = <TrtGptModelType.InflightFusedBatching: 2>
V1: typing.ClassVar[TrtGptModelType] # value = <TrtGptModelType.V1: 0>
__members__: typing.ClassVar[dict[str, TrtGptModelType]] # value = {'V1': <TrtGptModelType.V1: 0>, 'InflightBatching': <TrtGptModelType.InflightBatching: 1>, 'InflightFusedBatching': <TrtGptModelType.InflightFusedBatching: 2>}
def __eq__(self, other: typing.Any) -> bool:
...
def __getstate__(self) -> int:
...
def __hash__(self) -> int:
...
def __index__(self) -> int:
...
def __init__(self, value: int) -> None:
...
def __int__(self) -> int:
...
def __ne__(self, other: typing.Any) -> bool:
...
def __repr__(self) -> str:
...
def __setstate__(self, state: int) -> None:
...
def __str__(self) -> str:
...
@property
def name(self) -> str:
...
@property
def value(self) -> int:
...
class WorldConfig:
@staticmethod
def mpi(gpus_per_node: int = 8, tensor_parallelism: int | None = None, pipeline_parallelism: int | None = None, device_ids: list[int] | None = None) -> WorldConfig:
...
def __init__(self, tensor_parallelism: int = 1, pipeline_parallelism: int = 1, rank: int = 0, gpus_per_node: int = 8, device_ids: list[int] | None = None) -> None:
...
@property
def device(self) -> int:
...
@property
def gpus_per_group(self) -> int:
...
@property
def gpus_per_node(self) -> int:
...
@property
def is_pipeline_parallel(self) -> bool:
...
@property
def is_tensor_parallel(self) -> bool:
...
@property
def local_rank(self) -> int:
...
@property
def node_rank(self) -> int:
...
@property
def pipeline_parallel_rank(self) -> int:
...
@property
def pipeline_parallelism(self) -> int:
...
@property
def rank(self) -> int:
...
@property
def size(self) -> int:
...
@property
def tensor_parallel_rank(self) -> int:
...
@property
def tensor_parallelism(self) -> int:
...
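# Usage sketch: derive this process's placement from MPI (assumed to be set up
# by the launcher) and inspect the parallel layout.
#
#     world = WorldConfig.mpi(tensor_parallelism=2, pipeline_parallelism=1)
#     print(world.rank, world.size, world.tensor_parallel_rank, world.device)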
BF16: DataType # value = <DataType.BF16: 7>
BOOL: DataType # value = <DataType.BOOL: 4>
FLOAT: DataType # value = <DataType.FLOAT: 0>
FP8: DataType # value = <DataType.FP8: 6>
HALF: DataType # value = <DataType.HALF: 1>
INT32: DataType # value = <DataType.INT32: 3>
INT64: DataType # value = <DataType.INT64: 8>
INT8: DataType # value = <DataType.INT8: 2>
UINT8: DataType # value = <DataType.UINT8: 5>