TensorRT-LLM-Windows-RTX40 / scripts /start_server_5000.py
xThr45hx's picture
Add source, patches, scripts, build notes, README, LICENSE
00db36f verified
Raw
History Blame Contribute Delete
1.04 kB
from tensorrt_llm.commands.serve import get_llm_args, launch_server
if __name__ == '__main__':
    # Entry point: build the LLM argument object, then start the server.
    # Everything lives under the __main__ guard so that importing this
    # module does not launch anything.

    # Keyword arguments forwarded verbatim to get_llm_args().
    engine_options = {
        # Engine and tokenizer locations (machine-specific absolute paths).
        'model': r'D:\AI\models\Josiefied-Qwen3-4B-ablit-gptq-engine',
        'tokenizer': r'D:\AI\models\Josiefied-Qwen3-4B-abliterated-v1',
        'backend': 'tensorrt',
        # Batch / sequence limits.
        'max_beam_width': 1,
        'max_batch_size': 1,
        'max_num_tokens': 16384,
        'max_seq_len': 16384,
        # Parallelism settings: every size is 1, expert parallelism unset.
        'tensor_parallel_size': 1,
        'pipeline_parallel_size': 1,
        'context_parallel_size': 1,
        'moe_expert_parallel_size': None,
        'gpus_per_node': 1,
        # Memory / KV-cache settings.
        'free_gpu_memory_fraction': 0.9,
        'kv_cache_dtype': 'auto',
        # Remaining options.
        'num_postprocess_workers': 0,
        'trust_remote_code': False,
        'revision': None,
        'reasoning_parser': 'qwen3',
        'fail_fast_on_attention_window_too_large': False,
        'otlp_traces_endpoint': None,
        'enable_chunked_prefill': True,
    }

    # get_llm_args() returns a 2-tuple; only the first element is used here.
    built_args, _unused = get_llm_args(**engine_options)

    # NOTE(review): the script appears to be named start_server_5000.py but
    # the port bound here is 5001 -- confirm which one is intended.
    endpoint_options = {
        'host': 'localhost',
        'port': 5001,
        'llm_args': built_args,
        'served_model_name': 'Josiefied-Qwen3-4B',
    }
    launch_server(**endpoint_options)